/*
 **********************************************************************
 *   Copyright (C) 2005-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
12 #include "unicode/ucsdet.h"
/* Number of elements in a true array (not a pointer). The argument is
 * parenthesized so any array-valued expression can be passed safely. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Obtains raw storage for `count` objects of `type` from uprv_malloc; no
 * constructors are run. The expansion is wrapped in parentheses so it can be
 * used inside a larger expression. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

/* Hands storage back to uprv_free; no destructors are run. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
36 static U_NAMESPACE_QUALIFIER CharsetRecognizer
**fCSRecognizers
= NULL
;
38 static int32_t fCSRecognizers_size
= 0;
40 static UBool U_CALLCONV
csdet_cleanup(void)
42 if (fCSRecognizers
!= NULL
) {
43 for(int32_t r
= 0; r
< fCSRecognizers_size
; r
+= 1) {
44 delete fCSRecognizers
[r
];
45 fCSRecognizers
[r
] = NULL
;
48 DELETE_ARRAY(fCSRecognizers
);
49 fCSRecognizers
= NULL
;
50 fCSRecognizers_size
= 0;
56 static int32_t U_CALLCONV
57 charsetMatchComparator(const void * /*context*/, const void *left
, const void *right
)
61 const CharsetMatch
**csm_l
= (const CharsetMatch
**) left
;
62 const CharsetMatch
**csm_r
= (const CharsetMatch
**) right
;
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r
)->getConfidence() - (*csm_l
)->getConfidence();
72 void CharsetDetector::setRecognizers(UErrorCode
&status
)
75 CharsetRecognizer
**recognizers
;
77 if (U_FAILURE(status
)) {
81 UMTX_CHECK(NULL
, (UBool
) (fCSRecognizers
== NULL
), needsInit
);
84 CharsetRecognizer
*tempArray
[] = {
85 new CharsetRecog_UTF8(),
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
92 new CharsetRecog_8859_1_en(),
93 new CharsetRecog_8859_1_da(),
94 new CharsetRecog_8859_1_de(),
95 new CharsetRecog_8859_1_es(),
96 new CharsetRecog_8859_1_fr(),
97 new CharsetRecog_8859_1_it(),
98 new CharsetRecog_8859_1_nl(),
99 new CharsetRecog_8859_1_no(),
100 new CharsetRecog_8859_1_pt(),
101 new CharsetRecog_8859_1_sv(),
102 new CharsetRecog_8859_2_cs(),
103 new CharsetRecog_8859_2_hu(),
104 new CharsetRecog_8859_2_pl(),
105 new CharsetRecog_8859_2_ro(),
106 new CharsetRecog_8859_5_ru(),
107 new CharsetRecog_8859_6_ar(),
108 new CharsetRecog_8859_7_el(),
109 new CharsetRecog_8859_8_I_he(),
110 new CharsetRecog_8859_8_he(),
111 new CharsetRecog_windows_1251(),
112 new CharsetRecog_windows_1256(),
113 new CharsetRecog_KOI8_R(),
114 new CharsetRecog_8859_9_tr(),
115 new CharsetRecog_sjis(),
116 new CharsetRecog_gb_18030(),
117 new CharsetRecog_euc_jp(),
118 new CharsetRecog_euc_kr(),
119 new CharsetRecog_big5(),
121 new CharsetRecog_2022JP(),
122 new CharsetRecog_2022KR(),
123 new CharsetRecog_2022CN()
125 int32_t rCount
= ARRAY_SIZE(tempArray
);
128 recognizers
= NEW_ARRAY(CharsetRecognizer
*, rCount
);
130 if (recognizers
== NULL
) {
131 status
= U_MEMORY_ALLOCATION_ERROR
;
134 for (r
= 0; r
< rCount
; r
+= 1) {
135 recognizers
[r
] = tempArray
[r
];
137 if (recognizers
[r
] == NULL
) {
138 status
= U_MEMORY_ALLOCATION_ERROR
;
144 if (U_SUCCESS(status
)) {
146 if (fCSRecognizers
== NULL
) {
147 fCSRecognizers
= recognizers
;
148 fCSRecognizers_size
= rCount
;
153 if (fCSRecognizers
!= recognizers
) {
154 for (r
= 0; r
< rCount
; r
+= 1) {
155 delete recognizers
[r
];
156 recognizers
[r
] = NULL
;
159 DELETE_ARRAY(recognizers
);
163 ucln_i18n_registerCleanup(UCLN_I18N_CSDET
, csdet_cleanup
);
167 CharsetDetector::CharsetDetector(UErrorCode
&status
)
168 : textIn(new InputText(status
)), resultArray(NULL
),
169 resultCount(0), fStripTags(FALSE
), fFreshTextSet(FALSE
)
171 if (U_FAILURE(status
)) {
175 setRecognizers(status
);
177 if (U_FAILURE(status
)) {
181 resultArray
= (CharsetMatch
**)uprv_malloc(sizeof(CharsetMatch
*)*fCSRecognizers_size
);
183 if (resultArray
== NULL
) {
184 status
= U_MEMORY_ALLOCATION_ERROR
;
188 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
189 resultArray
[i
] = new CharsetMatch();
191 if (resultArray
[i
] == NULL
) {
192 status
= U_MEMORY_ALLOCATION_ERROR
;
198 CharsetDetector::~CharsetDetector()
202 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
203 delete resultArray
[i
];
206 uprv_free(resultArray
);
209 void CharsetDetector::setText(const char *in
, int32_t len
)
211 textIn
->setText(in
, len
);
212 fFreshTextSet
= TRUE
;
215 UBool
CharsetDetector::setStripTagsFlag(UBool flag
)
217 UBool temp
= fStripTags
;
219 fFreshTextSet
= TRUE
;
// Const accessor paired with setStripTagsFlag(). Its body is not visible in
// this part of the listing; presumably it returns fStripTags -- TODO confirm.
223 UBool
CharsetDetector::getStripTagsFlag() const
228 void CharsetDetector::setDeclaredEncoding(const char *encoding
, int32_t len
) const
230 textIn
->setDeclaredEncoding(encoding
,len
);
233 int32_t CharsetDetector::getDetectableCount()
235 UErrorCode status
= U_ZERO_ERROR
;
237 setRecognizers(status
);
239 return fCSRecognizers_size
;
242 const CharsetMatch
*CharsetDetector::detect(UErrorCode
&status
)
244 int32_t maxMatchesFound
= 0;
246 detectAll(maxMatchesFound
, status
);
248 if(maxMatchesFound
> 0) {
249 return resultArray
[0];
255 const CharsetMatch
* const *CharsetDetector::detectAll(int32_t &maxMatchesFound
, UErrorCode
&status
)
257 if(!textIn
->isSet()) {
258 status
= U_MISSING_RESOURCE_ERROR
;// TODO: Need to set proper status code for input text not set
261 } else if(fFreshTextSet
) {
262 CharsetRecognizer
*csr
;
263 int32_t detectResults
;
267 textIn
->MungeInput(fStripTags
);
269 // Iterate over all possible charsets, remember all that
270 // give a match quality > 0.
272 for (i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
273 csr
= fCSRecognizers
[i
];
274 detectResults
= csr
->match(textIn
);
275 confidence
= detectResults
;
277 if (confidence
> 0) {
278 resultArray
[resultCount
++]->set(textIn
, csr
, confidence
);
282 for(i
= resultCount
; i
< fCSRecognizers_size
; i
+= 1) {
283 resultArray
[i
]->set(textIn
, 0, 0);
286 uprv_sortArray(resultArray
, resultCount
, sizeof resultArray
[0], charsetMatchComparator
, NULL
, TRUE
, &status
);
288 //for(int32_t i = resultCount; i > 1; i -= 1) {
289 // for(int32_t j = 0; j < i-1; j += 1) {
290 // if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
291 // CharsetMatch *temp = resultArray[j];
292 // resultArray[j] = resultArray[j+1];
293 // resultArray[j+1] = temp;
298 fFreshTextSet
= FALSE
;
301 maxMatchesFound
= resultCount
;
306 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
308 if( index > fCSRecognizers_size-1 || index < 0) {
309 status = U_INDEX_OUTOFBOUNDS_ERROR;
313 return fCSRecognizers[index]->getName();
326 static void U_CALLCONV
327 enumClose(UEnumeration
*en
) {
328 if(en
->context
!= NULL
) {
329 DELETE_ARRAY(en
->context
);
335 static int32_t U_CALLCONV
336 enumCount(UEnumeration
*, UErrorCode
*) {
337 return fCSRecognizers_size
;
340 static const char* U_CALLCONV
341 enumNext(UEnumeration
*en
, int32_t *resultLength
, UErrorCode
* /*status*/) {
342 if(((Context
*)en
->context
)->currIndex
>= fCSRecognizers_size
) {
343 if(resultLength
!= NULL
) {
348 const char *currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->getName();
349 if(resultLength
!= NULL
) {
350 *resultLength
= (int32_t)uprv_strlen(currName
);
352 ((Context
*)en
->context
)->currIndex
++;
357 static void U_CALLCONV
358 enumReset(UEnumeration
*en
, UErrorCode
*) {
359 ((Context
*)en
->context
)->currIndex
= 0;
// Template UEnumeration that ucsdet_getAllDetectableCharsets() copies with
// memcpy into each enumeration object it allocates. The initializer entries
// are not visible in this part of the listing.
362 static const UEnumeration gCSDetEnumeration
= {
372 U_CAPI UEnumeration
* U_EXPORT2
373 ucsdet_getAllDetectableCharsets(const UCharsetDetector
*ucsd
, UErrorCode
*status
)
377 if(U_FAILURE(*status
)) {
381 /* Initialize recognized charsets. */
382 CharsetDetector::getDetectableCount();
384 UEnumeration
*en
= NEW_ARRAY(UEnumeration
, 1);
385 memcpy(en
, &gCSDetEnumeration
, sizeof(UEnumeration
));
386 en
->context
= (void*)NEW_ARRAY(Context
, 1);
387 uprv_memset(en
->context
, 0, sizeof(Context
));