2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
12 #include "unicode/ucsdet.h"
// Number of elements in a real array (not a pointer). The operand is
// parenthesized so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Typed wrappers around uprv_malloc()/uprv_free(). NEW_ARRAY returns raw
// storage for `count` objects of `type` without running constructors; callers
// in this file check the result against NULL. The whole expansion is wrapped
// in parentheses so that a trailing postfix operator (e.g. `[i]`) applies to
// the typed pointer rather than to the void* returned by uprv_malloc().
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
37 struct CSRecognizerInfo
: public UMemory
{
38 CSRecognizerInfo(CharsetRecognizer
*recognizer
, UBool isDefaultEnabled
)
39 : recognizer(recognizer
), isDefaultEnabled(isDefaultEnabled
) {};
41 ~CSRecognizerInfo() {delete recognizer
;};
43 CharsetRecognizer
*recognizer
;
44 UBool isDefaultEnabled
;
49 static icu::CSRecognizerInfo
**fCSRecognizers
= NULL
;
50 static icu::UInitOnce gCSRecognizersInitOnce
;
51 static int32_t fCSRecognizers_size
= 0;
54 static UBool U_CALLCONV
csdet_cleanup(void)
57 if (fCSRecognizers
!= NULL
) {
58 for(int32_t r
= 0; r
< fCSRecognizers_size
; r
+= 1) {
59 delete fCSRecognizers
[r
];
60 fCSRecognizers
[r
] = NULL
;
63 DELETE_ARRAY(fCSRecognizers
);
64 fCSRecognizers
= NULL
;
65 fCSRecognizers_size
= 0;
67 gCSRecognizersInitOnce
.reset();
72 static int32_t U_CALLCONV
73 charsetMatchComparator(const void * /*context*/, const void *left
, const void *right
)
77 const CharsetMatch
**csm_l
= (const CharsetMatch
**) left
;
78 const CharsetMatch
**csm_r
= (const CharsetMatch
**) right
;
80 // NOTE: compare is backwards to sort from highest to lowest.
81 return (*csm_r
)->getConfidence() - (*csm_l
)->getConfidence();
84 static void U_CALLCONV
initRecognizers(UErrorCode
&status
) {
86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET
, csdet_cleanup
);
87 CSRecognizerInfo
*tempArray
[] = {
88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE
),
90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE
),
91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE
),
92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE
),
93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE
),
95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE
),
96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE
),
97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE
),
98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE
),
99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE
),
100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE
),
101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE
),
102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE
),
103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE
),
104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE
),
105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE
),
106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE
),
107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE
),
108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE
),
109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE
),
110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE
),
112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE
),
113 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE
),
114 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE
),
116 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE
),
117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE
),
118 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE
),
119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE
)
121 int32_t rCount
= ARRAY_SIZE(tempArray
);
123 fCSRecognizers
= NEW_ARRAY(CSRecognizerInfo
*, rCount
);
125 if (fCSRecognizers
== NULL
) {
126 status
= U_MEMORY_ALLOCATION_ERROR
;
129 fCSRecognizers_size
= rCount
;
130 for (int32_t r
= 0; r
< rCount
; r
+= 1) {
131 fCSRecognizers
[r
] = tempArray
[r
];
132 if (fCSRecognizers
[r
] == NULL
) {
133 status
= U_MEMORY_ALLOCATION_ERROR
;
143 void CharsetDetector::setRecognizers(UErrorCode
&status
)
145 umtx_initOnce(gCSRecognizersInitOnce
, &initRecognizers
, status
);
148 CharsetDetector::CharsetDetector(UErrorCode
&status
)
149 : textIn(new InputText(status
)), resultArray(NULL
),
150 resultCount(0), fStripTags(FALSE
), fFreshTextSet(FALSE
),
151 fEnabledRecognizers(NULL
)
153 if (U_FAILURE(status
)) {
157 setRecognizers(status
);
159 if (U_FAILURE(status
)) {
163 resultArray
= (CharsetMatch
**)uprv_malloc(sizeof(CharsetMatch
*)*fCSRecognizers_size
);
165 if (resultArray
== NULL
) {
166 status
= U_MEMORY_ALLOCATION_ERROR
;
170 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
171 resultArray
[i
] = new CharsetMatch();
173 if (resultArray
[i
] == NULL
) {
174 status
= U_MEMORY_ALLOCATION_ERROR
;
180 CharsetDetector::~CharsetDetector()
184 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
185 delete resultArray
[i
];
188 uprv_free(resultArray
);
190 if (fEnabledRecognizers
) {
191 uprv_free(fEnabledRecognizers
);
195 void CharsetDetector::setText(const char *in
, int32_t len
)
197 textIn
->setText(in
, len
);
198 fFreshTextSet
= TRUE
;
201 UBool
CharsetDetector::setStripTagsFlag(UBool flag
)
203 UBool temp
= fStripTags
;
205 fFreshTextSet
= TRUE
;
209 UBool
CharsetDetector::getStripTagsFlag() const
214 void CharsetDetector::setDeclaredEncoding(const char *encoding
, int32_t len
) const
216 textIn
->setDeclaredEncoding(encoding
,len
);
219 int32_t CharsetDetector::getDetectableCount()
221 UErrorCode status
= U_ZERO_ERROR
;
223 setRecognizers(status
);
225 return fCSRecognizers_size
;
228 const CharsetMatch
*CharsetDetector::detect(UErrorCode
&status
)
230 int32_t maxMatchesFound
= 0;
232 detectAll(maxMatchesFound
, status
);
234 if(maxMatchesFound
> 0) {
235 return resultArray
[0];
241 const CharsetMatch
* const *CharsetDetector::detectAll(int32_t &maxMatchesFound
, UErrorCode
&status
)
243 if(!textIn
->isSet()) {
244 status
= U_MISSING_RESOURCE_ERROR
;// TODO: Need to set proper status code for input text not set
247 } else if (fFreshTextSet
) {
248 CharsetRecognizer
*csr
;
251 textIn
->MungeInput(fStripTags
);
253 // Iterate over all possible charsets, remember all that
254 // give a match quality > 0.
256 for (i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
257 csr
= fCSRecognizers
[i
]->recognizer
;
258 if (csr
->match(textIn
, resultArray
[resultCount
])) {
263 if (resultCount
> 1) {
264 uprv_sortArray(resultArray
, resultCount
, sizeof resultArray
[0], charsetMatchComparator
, NULL
, TRUE
, &status
);
266 fFreshTextSet
= FALSE
;
269 maxMatchesFound
= resultCount
;
274 void CharsetDetector::setDetectableCharset(const char *encoding
, UBool enabled
, UErrorCode
&status
)
276 if (U_FAILURE(status
)) {
281 UBool isDefaultVal
= FALSE
;
282 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
++) {
283 CSRecognizerInfo
*csrinfo
= fCSRecognizers
[i
];
284 if (uprv_strcmp(csrinfo
->recognizer
->getName(), encoding
) == 0) {
286 isDefaultVal
= (csrinfo
->isDefaultEnabled
== enabled
);
291 // No matching encoding found
292 status
= U_ILLEGAL_ARGUMENT_ERROR
;
296 if (fEnabledRecognizers
== NULL
&& !isDefaultVal
) {
297 // Create an array storing the non default setting
298 fEnabledRecognizers
= NEW_ARRAY(UBool
, fCSRecognizers_size
);
299 if (fEnabledRecognizers
== NULL
) {
300 status
= U_MEMORY_ALLOCATION_ERROR
;
303 // Initialize the array with default info
304 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
++) {
305 fEnabledRecognizers
[i
] = fCSRecognizers
[i
]->isDefaultEnabled
;
309 if (fEnabledRecognizers
!= NULL
) {
310 fEnabledRecognizers
[modIdx
] = enabled
;
314 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
316 if( index > fCSRecognizers_size-1 || index < 0) {
317 status = U_INDEX_OUTOFBOUNDS_ERROR;
321 return fCSRecognizers[index]->getName();
331 UBool
*enabledRecognizers
;
336 static void U_CALLCONV
337 enumClose(UEnumeration
*en
) {
338 if(en
->context
!= NULL
) {
339 DELETE_ARRAY(en
->context
);
345 static int32_t U_CALLCONV
346 enumCount(UEnumeration
*en
, UErrorCode
*) {
347 if (((Context
*)en
->context
)->all
) {
348 // ucsdet_getAllDetectableCharsets, all charset detector names
349 return fCSRecognizers_size
;
352 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
354 UBool
*enabledArray
= ((Context
*)en
->context
)->enabledRecognizers
;
355 if (enabledArray
!= NULL
) {
357 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
++) {
358 if (enabledArray
[i
]) {
364 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
++) {
365 if (fCSRecognizers
[i
]->isDefaultEnabled
) {
373 static const char* U_CALLCONV
374 enumNext(UEnumeration
*en
, int32_t *resultLength
, UErrorCode
* /*status*/) {
375 const char *currName
= NULL
;
377 if (((Context
*)en
->context
)->currIndex
< fCSRecognizers_size
) {
378 if (((Context
*)en
->context
)->all
) {
379 // ucsdet_getAllDetectableCharsets, all charset detector names
380 currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->recognizer
->getName();
381 ((Context
*)en
->context
)->currIndex
++;
383 // ucsdet_getDetectableCharsets
384 UBool
*enabledArray
= ((Context
*)en
->context
)->enabledRecognizers
;
385 if (enabledArray
!= NULL
) {
387 while (currName
== NULL
&& ((Context
*)en
->context
)->currIndex
< fCSRecognizers_size
) {
388 if (enabledArray
[((Context
*)en
->context
)->currIndex
]) {
389 currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->recognizer
->getName();
391 ((Context
*)en
->context
)->currIndex
++;
395 while (currName
== NULL
&& ((Context
*)en
->context
)->currIndex
< fCSRecognizers_size
) {
396 if (fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->isDefaultEnabled
) {
397 currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->recognizer
->getName();
399 ((Context
*)en
->context
)->currIndex
++;
405 if(resultLength
!= NULL
) {
406 *resultLength
= currName
== NULL
? 0 : (int32_t)uprv_strlen(currName
);
413 static void U_CALLCONV
414 enumReset(UEnumeration
*en
, UErrorCode
*) {
415 ((Context
*)en
->context
)->currIndex
= 0;
418 static const UEnumeration gCSDetEnumeration
= {
432 UEnumeration
* CharsetDetector::getAllDetectableCharsets(UErrorCode
&status
)
435 /* Initialize recognized charsets. */
436 setRecognizers(status
);
438 if(U_FAILURE(status
)) {
442 UEnumeration
*en
= NEW_ARRAY(UEnumeration
, 1);
444 status
= U_MEMORY_ALLOCATION_ERROR
;
447 memcpy(en
, &gCSDetEnumeration
, sizeof(UEnumeration
));
448 en
->context
= (void*)NEW_ARRAY(Context
, 1);
449 if (en
->context
== NULL
) {
450 status
= U_MEMORY_ALLOCATION_ERROR
;
454 uprv_memset(en
->context
, 0, sizeof(Context
));
455 ((Context
*)en
->context
)->all
= TRUE
;
459 UEnumeration
* CharsetDetector::getDetectableCharsets(UErrorCode
&status
) const
461 if(U_FAILURE(status
)) {
465 UEnumeration
*en
= NEW_ARRAY(UEnumeration
, 1);
467 status
= U_MEMORY_ALLOCATION_ERROR
;
470 memcpy(en
, &gCSDetEnumeration
, sizeof(UEnumeration
));
471 en
->context
= (void*)NEW_ARRAY(Context
, 1);
472 if (en
->context
== NULL
) {
473 status
= U_MEMORY_ALLOCATION_ERROR
;
477 uprv_memset(en
->context
, 0, sizeof(Context
));
478 ((Context
*)en
->context
)->all
= FALSE
;
479 ((Context
*)en
->context
)->enabledRecognizers
= fEnabledRecognizers
;