1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_CONVERSION
14 #include "unicode/ucsdet.h"
// Requests storage for `count` objects of `type` from uprv_malloc() and casts
// the result to `type *`. The macro itself runs no constructors.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Hands `array` back to uprv_free(). The macro itself runs no destructors.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
37 struct CSRecognizerInfo
: public UMemory
{
38 CSRecognizerInfo(CharsetRecognizer
*recognizer
, UBool isDefaultEnabled
)
39 : recognizer(recognizer
), isDefaultEnabled(isDefaultEnabled
) {};
41 ~CSRecognizerInfo() {delete recognizer
;};
43 CharsetRecognizer
*recognizer
;
44 UBool isDefaultEnabled
;
// Table of recognizer descriptors shared by every detector instance.
// Filled in by initRecognizers() and released by csdet_cleanup().
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
// Guard handed to umtx_initOnce() by CharsetDetector::setRecognizers().
static icu::UInitOnce gCSRecognizersInitOnce;
// Number of entries in fCSRecognizers; 0 until the table has been built.
static int32_t fCSRecognizers_size = 0;
54 static UBool U_CALLCONV
csdet_cleanup(void)
57 if (fCSRecognizers
!= NULL
) {
58 for(int32_t r
= 0; r
< fCSRecognizers_size
; r
+= 1) {
59 delete fCSRecognizers
[r
];
60 fCSRecognizers
[r
] = NULL
;
63 DELETE_ARRAY(fCSRecognizers
);
64 fCSRecognizers
= NULL
;
65 fCSRecognizers_size
= 0;
67 gCSRecognizersInitOnce
.reset();
72 static int32_t U_CALLCONV
73 charsetMatchComparator(const void * /*context*/, const void *left
, const void *right
)
77 const CharsetMatch
**csm_l
= (const CharsetMatch
**) left
;
78 const CharsetMatch
**csm_r
= (const CharsetMatch
**) right
;
80 // NOTE: compare is backwards to sort from highest to lowest.
81 return (*csm_r
)->getConfidence() - (*csm_l
)->getConfidence();
84 static void U_CALLCONV
initRecognizers(UErrorCode
&status
) {
86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET
, csdet_cleanup
);
87 CSRecognizerInfo
*tempArray
[] = {
88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE
),
90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE
),
91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE
),
92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE
),
93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE
),
95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE
),
96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE
),
97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE
),
98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE
),
99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE
),
100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE
),
101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE
),
102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE
),
103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE
),
104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE
),
105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE
),
106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE
),
107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE
),
108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE
),
109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE
),
110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE
),
112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE
),
113 #if !UCONFIG_ONLY_HTML_CONVERSION
114 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE
),
115 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE
),
117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE
),
118 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE
),
119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE
),
120 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE
)
123 int32_t rCount
= UPRV_LENGTHOF(tempArray
);
125 fCSRecognizers
= NEW_ARRAY(CSRecognizerInfo
*, rCount
);
127 if (fCSRecognizers
== NULL
) {
128 status
= U_MEMORY_ALLOCATION_ERROR
;
131 fCSRecognizers_size
= rCount
;
132 for (int32_t r
= 0; r
< rCount
; r
+= 1) {
133 fCSRecognizers
[r
] = tempArray
[r
];
134 if (fCSRecognizers
[r
] == NULL
) {
135 status
= U_MEMORY_ALLOCATION_ERROR
;
145 void CharsetDetector::setRecognizers(UErrorCode
&status
)
147 umtx_initOnce(gCSRecognizersInitOnce
, &initRecognizers
, status
);
150 CharsetDetector::CharsetDetector(UErrorCode
&status
)
151 : textIn(new InputText(status
)), resultArray(NULL
),
152 resultCount(0), fStripTags(FALSE
), fFreshTextSet(FALSE
),
153 fEnabledRecognizers(NULL
)
155 if (U_FAILURE(status
)) {
159 setRecognizers(status
);
161 if (U_FAILURE(status
)) {
165 resultArray
= (CharsetMatch
**)uprv_malloc(sizeof(CharsetMatch
*)*fCSRecognizers_size
);
167 if (resultArray
== NULL
) {
168 status
= U_MEMORY_ALLOCATION_ERROR
;
172 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
173 resultArray
[i
] = new CharsetMatch();
175 if (resultArray
[i
] == NULL
) {
176 status
= U_MEMORY_ALLOCATION_ERROR
;
182 CharsetDetector::~CharsetDetector()
186 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
187 delete resultArray
[i
];
190 uprv_free(resultArray
);
192 if (fEnabledRecognizers
) {
193 uprv_free(fEnabledRecognizers
);
197 void CharsetDetector::setText(const char *in
, int32_t len
)
199 textIn
->setText(in
, len
);
200 fFreshTextSet
= TRUE
;
203 UBool
CharsetDetector::setStripTagsFlag(UBool flag
)
205 UBool temp
= fStripTags
;
207 fFreshTextSet
= TRUE
;
211 UBool
CharsetDetector::getStripTagsFlag() const
216 void CharsetDetector::setDeclaredEncoding(const char *encoding
, int32_t len
) const
218 textIn
->setDeclaredEncoding(encoding
,len
);
221 int32_t CharsetDetector::getDetectableCount()
223 UErrorCode status
= U_ZERO_ERROR
;
225 setRecognizers(status
);
227 return fCSRecognizers_size
;
230 const CharsetMatch
*CharsetDetector::detect(UErrorCode
&status
)
232 int32_t maxMatchesFound
= 0;
234 detectAll(maxMatchesFound
, status
);
236 if(maxMatchesFound
> 0) {
237 return resultArray
[0];
243 const CharsetMatch
* const *CharsetDetector::detectAll(int32_t &maxMatchesFound
, UErrorCode
&status
)
245 if(!textIn
->isSet()) {
246 status
= U_MISSING_RESOURCE_ERROR
;// TODO: Need to set proper status code for input text not set
249 } else if (fFreshTextSet
) {
250 CharsetRecognizer
*csr
;
253 textIn
->MungeInput(fStripTags
);
255 // Iterate over all possible charsets, remember all that
256 // give a match quality > 0.
258 for (i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
259 csr
= fCSRecognizers
[i
]->recognizer
;
260 if (csr
->match(textIn
, resultArray
[resultCount
])) {
265 if (resultCount
> 1) {
266 uprv_sortArray(resultArray
, resultCount
, sizeof resultArray
[0], charsetMatchComparator
, NULL
, TRUE
, &status
);
268 fFreshTextSet
= FALSE
;
271 maxMatchesFound
= resultCount
;
276 void CharsetDetector::setDetectableCharset(const char *encoding
, UBool enabled
, UErrorCode
&status
)
278 if (U_FAILURE(status
)) {
283 UBool isDefaultVal
= FALSE
;
284 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
++) {
285 CSRecognizerInfo
*csrinfo
= fCSRecognizers
[i
];
286 if (uprv_strcmp(csrinfo
->recognizer
->getName(), encoding
) == 0) {
288 isDefaultVal
= (csrinfo
->isDefaultEnabled
== enabled
);
293 // No matching encoding found
294 status
= U_ILLEGAL_ARGUMENT_ERROR
;
298 if (fEnabledRecognizers
== NULL
&& !isDefaultVal
) {
299 // Create an array storing the non default setting
300 fEnabledRecognizers
= NEW_ARRAY(UBool
, fCSRecognizers_size
);
301 if (fEnabledRecognizers
== NULL
) {
302 status
= U_MEMORY_ALLOCATION_ERROR
;
305 // Initialize the array with default info
306 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
++) {
307 fEnabledRecognizers
[i
] = fCSRecognizers
[i
]->isDefaultEnabled
;
311 if (fEnabledRecognizers
!= NULL
) {
312 fEnabledRecognizers
[modIdx
] = enabled
;
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
{
    if( index > fCSRecognizers_size-1 || index < 0) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;

        return 0;
    } else {
        return fCSRecognizers[index]->getName();
    }
}*/
333 UBool
*enabledRecognizers
;
338 static void U_CALLCONV
339 enumClose(UEnumeration
*en
) {
340 if(en
->context
!= NULL
) {
341 DELETE_ARRAY(en
->context
);
347 static int32_t U_CALLCONV
348 enumCount(UEnumeration
*en
, UErrorCode
*) {
349 if (((Context
*)en
->context
)->all
) {
350 // ucsdet_getAllDetectableCharsets, all charset detector names
351 return fCSRecognizers_size
;
354 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
356 UBool
*enabledArray
= ((Context
*)en
->context
)->enabledRecognizers
;
357 if (enabledArray
!= NULL
) {
359 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
++) {
360 if (enabledArray
[i
]) {
366 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
++) {
367 if (fCSRecognizers
[i
]->isDefaultEnabled
) {
375 static const char* U_CALLCONV
376 enumNext(UEnumeration
*en
, int32_t *resultLength
, UErrorCode
* /*status*/) {
377 const char *currName
= NULL
;
379 if (((Context
*)en
->context
)->currIndex
< fCSRecognizers_size
) {
380 if (((Context
*)en
->context
)->all
) {
381 // ucsdet_getAllDetectableCharsets, all charset detector names
382 currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->recognizer
->getName();
383 ((Context
*)en
->context
)->currIndex
++;
385 // ucsdet_getDetectableCharsets
386 UBool
*enabledArray
= ((Context
*)en
->context
)->enabledRecognizers
;
387 if (enabledArray
!= NULL
) {
389 while (currName
== NULL
&& ((Context
*)en
->context
)->currIndex
< fCSRecognizers_size
) {
390 if (enabledArray
[((Context
*)en
->context
)->currIndex
]) {
391 currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->recognizer
->getName();
393 ((Context
*)en
->context
)->currIndex
++;
397 while (currName
== NULL
&& ((Context
*)en
->context
)->currIndex
< fCSRecognizers_size
) {
398 if (fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->isDefaultEnabled
) {
399 currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->recognizer
->getName();
401 ((Context
*)en
->context
)->currIndex
++;
407 if(resultLength
!= NULL
) {
408 *resultLength
= currName
== NULL
? 0 : (int32_t)uprv_strlen(currName
);
415 static void U_CALLCONV
416 enumReset(UEnumeration
*en
, UErrorCode
*) {
417 ((Context
*)en
->context
)->currIndex
= 0;
420 static const UEnumeration gCSDetEnumeration
= {
434 UEnumeration
* CharsetDetector::getAllDetectableCharsets(UErrorCode
&status
)
437 /* Initialize recognized charsets. */
438 setRecognizers(status
);
440 if(U_FAILURE(status
)) {
444 UEnumeration
*en
= NEW_ARRAY(UEnumeration
, 1);
446 status
= U_MEMORY_ALLOCATION_ERROR
;
449 memcpy(en
, &gCSDetEnumeration
, sizeof(UEnumeration
));
450 en
->context
= (void*)NEW_ARRAY(Context
, 1);
451 if (en
->context
== NULL
) {
452 status
= U_MEMORY_ALLOCATION_ERROR
;
456 uprv_memset(en
->context
, 0, sizeof(Context
));
457 ((Context
*)en
->context
)->all
= TRUE
;
461 UEnumeration
* CharsetDetector::getDetectableCharsets(UErrorCode
&status
) const
463 if(U_FAILURE(status
)) {
467 UEnumeration
*en
= NEW_ARRAY(UEnumeration
, 1);
469 status
= U_MEMORY_ALLOCATION_ERROR
;
472 memcpy(en
, &gCSDetEnumeration
, sizeof(UEnumeration
));
473 en
->context
= (void*)NEW_ARRAY(Context
, 1);
474 if (en
->context
== NULL
) {
475 status
= U_MEMORY_ALLOCATION_ERROR
;
479 uprv_memset(en
->context
, 0, sizeof(Context
));
480 ((Context
*)en
->context
)->all
= FALSE
;
481 ((Context
*)en
->context
)->enabledRecognizers
= fEnabledRecognizers
;