2 **********************************************************************
3 * Copyright (C) 2005-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
12 #include "unicode/ucsdet.h"
// Number of elements in a true array (not a pointer). The argument is fully
// parenthesized so that any array-valued expression, not just a plain
// identifier, is measured correctly.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Obtains raw storage for `count` objects of `type` from uprv_malloc. The whole
// expansion is parenthesized so that the cast applies before any operator the
// caller writes after the macro. Callers in this file test the result for NULL.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

// Returns storage obtained with NEW_ARRAY to uprv_free.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
36 static U_NAMESPACE_QUALIFIER CharsetRecognizer
**fCSRecognizers
= NULL
;
38 static int32_t fCSRecognizers_size
= 0;
40 static UBool U_CALLCONV
csdet_cleanup(void)
42 if (fCSRecognizers
!= NULL
) {
43 for(int32_t r
= 0; r
< fCSRecognizers_size
; r
+= 1) {
44 delete fCSRecognizers
[r
];
45 fCSRecognizers
[r
] = NULL
;
48 DELETE_ARRAY(fCSRecognizers
);
49 fCSRecognizers
= NULL
;
50 fCSRecognizers_size
= 0;
56 static int32_t U_CALLCONV
57 charsetMatchComparator(const void * /*context*/, const void *left
, const void *right
)
61 const CharsetMatch
**csm_l
= (const CharsetMatch
**) left
;
62 const CharsetMatch
**csm_r
= (const CharsetMatch
**) right
;
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r
)->getConfidence() - (*csm_l
)->getConfidence();
72 void CharsetDetector::setRecognizers(UErrorCode
&status
)
75 CharsetRecognizer
**recognizers
;
77 if (U_FAILURE(status
)) {
81 UMTX_CHECK(NULL
, (UBool
) (fCSRecognizers
== NULL
), needsInit
);
84 CharsetRecognizer
*tempArray
[] = {
85 new CharsetRecog_UTF8(),
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
92 new CharsetRecog_8859_1_en(),
93 new CharsetRecog_8859_1_da(),
94 new CharsetRecog_8859_1_de(),
95 new CharsetRecog_8859_1_es(),
96 new CharsetRecog_8859_1_fr(),
97 new CharsetRecog_8859_1_it(),
98 new CharsetRecog_8859_1_nl(),
99 new CharsetRecog_8859_1_no(),
100 new CharsetRecog_8859_1_pt(),
101 new CharsetRecog_8859_1_sv(),
102 new CharsetRecog_8859_2_cs(),
103 new CharsetRecog_8859_2_hu(),
104 new CharsetRecog_8859_2_pl(),
105 new CharsetRecog_8859_2_ro(),
106 new CharsetRecog_8859_5_ru(),
107 new CharsetRecog_8859_6_ar(),
108 new CharsetRecog_8859_7_el(),
109 new CharsetRecog_8859_8_I_he(),
110 new CharsetRecog_8859_8_he(),
111 new CharsetRecog_windows_1251(),
112 new CharsetRecog_windows_1256(),
113 new CharsetRecog_KOI8_R(),
114 new CharsetRecog_8859_9_tr(),
115 new CharsetRecog_sjis(),
116 new CharsetRecog_gb_18030(),
117 new CharsetRecog_euc_jp(),
118 new CharsetRecog_euc_kr(),
119 new CharsetRecog_big5(),
121 new CharsetRecog_2022JP(),
122 new CharsetRecog_2022KR(),
123 new CharsetRecog_2022CN(),
125 new CharsetRecog_IBM424_he_rtl(),
126 new CharsetRecog_IBM424_he_ltr(),
127 new CharsetRecog_IBM420_ar_rtl(),
128 new CharsetRecog_IBM420_ar_ltr()
130 int32_t rCount
= ARRAY_SIZE(tempArray
);
133 recognizers
= NEW_ARRAY(CharsetRecognizer
*, rCount
);
135 if (recognizers
== NULL
) {
136 status
= U_MEMORY_ALLOCATION_ERROR
;
139 for (r
= 0; r
< rCount
; r
+= 1) {
140 recognizers
[r
] = tempArray
[r
];
142 if (recognizers
[r
] == NULL
) {
143 status
= U_MEMORY_ALLOCATION_ERROR
;
149 if (U_SUCCESS(status
)) {
151 if (fCSRecognizers
== NULL
) {
152 fCSRecognizers_size
= rCount
;
153 fCSRecognizers
= recognizers
;
158 if (fCSRecognizers
!= recognizers
) {
159 for (r
= 0; r
< rCount
; r
+= 1) {
160 delete recognizers
[r
];
161 recognizers
[r
] = NULL
;
164 DELETE_ARRAY(recognizers
);
168 ucln_i18n_registerCleanup(UCLN_I18N_CSDET
, csdet_cleanup
);
172 CharsetDetector::CharsetDetector(UErrorCode
&status
)
173 : textIn(new InputText(status
)), resultArray(NULL
),
174 resultCount(0), fStripTags(FALSE
), fFreshTextSet(FALSE
)
176 if (U_FAILURE(status
)) {
180 setRecognizers(status
);
182 if (U_FAILURE(status
)) {
186 resultArray
= (CharsetMatch
**)uprv_malloc(sizeof(CharsetMatch
*)*fCSRecognizers_size
);
188 if (resultArray
== NULL
) {
189 status
= U_MEMORY_ALLOCATION_ERROR
;
193 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
194 resultArray
[i
] = new CharsetMatch();
196 if (resultArray
[i
] == NULL
) {
197 status
= U_MEMORY_ALLOCATION_ERROR
;
203 CharsetDetector::~CharsetDetector()
207 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
208 delete resultArray
[i
];
211 uprv_free(resultArray
);
214 void CharsetDetector::setText(const char *in
, int32_t len
)
216 textIn
->setText(in
, len
);
217 fFreshTextSet
= TRUE
;
220 UBool
CharsetDetector::setStripTagsFlag(UBool flag
)
222 UBool temp
= fStripTags
;
224 fFreshTextSet
= TRUE
;
228 UBool
CharsetDetector::getStripTagsFlag() const
233 void CharsetDetector::setDeclaredEncoding(const char *encoding
, int32_t len
) const
235 textIn
->setDeclaredEncoding(encoding
,len
);
238 int32_t CharsetDetector::getDetectableCount()
240 UErrorCode status
= U_ZERO_ERROR
;
242 setRecognizers(status
);
244 return fCSRecognizers_size
;
247 const CharsetMatch
*CharsetDetector::detect(UErrorCode
&status
)
249 int32_t maxMatchesFound
= 0;
251 detectAll(maxMatchesFound
, status
);
253 if(maxMatchesFound
> 0) {
254 return resultArray
[0];
260 const CharsetMatch
* const *CharsetDetector::detectAll(int32_t &maxMatchesFound
, UErrorCode
&status
)
262 if(!textIn
->isSet()) {
263 status
= U_MISSING_RESOURCE_ERROR
;// TODO: Need to set proper status code for input text not set
266 } else if(fFreshTextSet
) {
267 CharsetRecognizer
*csr
;
268 int32_t detectResults
;
272 textIn
->MungeInput(fStripTags
);
274 // Iterate over all possible charsets, remember all that
275 // give a match quality > 0.
277 for (i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
278 csr
= fCSRecognizers
[i
];
279 detectResults
= csr
->match(textIn
);
280 confidence
= detectResults
;
282 if (confidence
> 0) {
283 resultArray
[resultCount
++]->set(textIn
, csr
, confidence
);
287 for(i
= resultCount
; i
< fCSRecognizers_size
; i
+= 1) {
288 resultArray
[i
]->set(textIn
, 0, 0);
291 uprv_sortArray(resultArray
, resultCount
, sizeof resultArray
[0], charsetMatchComparator
, NULL
, TRUE
, &status
);
293 // Remove duplicate charsets from the results.
294 // Simple minded, brute force approach - check each entry against all that follow.
295 // The first entry of any duplicated set is the one that should be kept because it will
296 // be the one with the highest confidence rating.
297 // (Duplicate matches have different languages, only the charset is the same)
298 // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
299 // deleted, just reordered, with the unwanted duplicates placed after the good results.
301 for (i
=0; i
<resultCount
; i
++) {
302 const char *charSetName
= resultArray
[i
]->getName();
303 for (j
=i
+1; j
<resultCount
; ) {
304 if (uprv_strcmp(charSetName
, resultArray
[j
]->getName()) != 0) {
308 // Duplicate entry at index j.
309 CharsetMatch
*duplicate
= resultArray
[j
];
310 for (k
=j
; k
<resultCount
-1; k
++) {
311 resultArray
[k
] = resultArray
[k
+1];
314 resultArray
[resultCount
] = duplicate
;
319 fFreshTextSet
= FALSE
;
322 maxMatchesFound
= resultCount
;
327 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
329 if( index > fCSRecognizers_size-1 || index < 0) {
330 status = U_INDEX_OUTOFBOUNDS_ERROR;
334 return fCSRecognizers[index]->getName();
347 static void U_CALLCONV
348 enumClose(UEnumeration
*en
) {
349 if(en
->context
!= NULL
) {
350 DELETE_ARRAY(en
->context
);
356 static int32_t U_CALLCONV
357 enumCount(UEnumeration
*, UErrorCode
*) {
358 return fCSRecognizers_size
;
361 static const char* U_CALLCONV
362 enumNext(UEnumeration
*en
, int32_t *resultLength
, UErrorCode
* /*status*/) {
363 if(((Context
*)en
->context
)->currIndex
>= fCSRecognizers_size
) {
364 if(resultLength
!= NULL
) {
369 const char *currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->getName();
370 if(resultLength
!= NULL
) {
371 *resultLength
= (int32_t)uprv_strlen(currName
);
373 ((Context
*)en
->context
)->currIndex
++;
378 static void U_CALLCONV
379 enumReset(UEnumeration
*en
, UErrorCode
*) {
380 ((Context
*)en
->context
)->currIndex
= 0;
383 static const UEnumeration gCSDetEnumeration
= {
393 U_CAPI UEnumeration
* U_EXPORT2
394 ucsdet_getAllDetectableCharsets(const UCharsetDetector
* /*ucsd*/, UErrorCode
*status
)
398 if(U_FAILURE(*status
)) {
402 /* Initialize recognized charsets. */
403 CharsetDetector::getDetectableCount();
405 UEnumeration
*en
= NEW_ARRAY(UEnumeration
, 1);
406 memcpy(en
, &gCSDetEnumeration
, sizeof(UEnumeration
));
407 en
->context
= (void*)NEW_ARRAY(Context
, 1);
408 uprv_memset(en
->context
, 0, sizeof(Context
));