2 **********************************************************************
3 * Copyright (C) 2005-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
12 #include "unicode/ucsdet.h"
/** Element count of a true array (not a pointer). The argument is fully parenthesized so any array-valued expression may be passed. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/** Requests storage for `count` objects of `type` from uprv_malloc and casts the result to `type *`. The expansion is wrapped in parentheses so it can be used inside larger expressions. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/** Hands storage obtained through NEW_ARRAY back to uprv_free. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
36 static CharsetRecognizer
**fCSRecognizers
= NULL
;
38 static int32_t fCSRecognizers_size
= 0;
40 static UBool U_CALLCONV
csdet_cleanup(void)
42 if (fCSRecognizers
!= NULL
) {
43 for(int32_t r
= 0; r
< fCSRecognizers_size
; r
+= 1) {
44 delete fCSRecognizers
[r
];
45 fCSRecognizers
[r
] = NULL
;
48 DELETE_ARRAY(fCSRecognizers
);
49 fCSRecognizers
= NULL
;
50 fCSRecognizers_size
= 0;
56 static int32_t U_CALLCONV
57 charsetMatchComparator(const void *context
, const void *left
, const void *right
)
59 const CharsetMatch
**csm_l
= (const CharsetMatch
**) left
;
60 const CharsetMatch
**csm_r
= (const CharsetMatch
**) right
;
62 // NOTE: compare is backwards to sort from highest to lowest.
63 return (*csm_r
)->getConfidence() - (*csm_l
)->getConfidence();
70 void CharsetDetector::setRecognizers(UErrorCode
&status
)
73 CharsetRecognizer
**recognizers
;
75 if (U_FAILURE(status
)) {
80 needsInit
= (UBool
) (fCSRecognizers
== NULL
);
84 CharsetRecognizer
*tempArray
[] = {
85 new CharsetRecog_UTF8(),
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
92 new CharsetRecog_8859_1_en(),
93 new CharsetRecog_8859_1_da(),
94 new CharsetRecog_8859_1_de(),
95 new CharsetRecog_8859_1_es(),
96 new CharsetRecog_8859_1_fr(),
97 new CharsetRecog_8859_1_it(),
98 new CharsetRecog_8859_1_nl(),
99 new CharsetRecog_8859_1_no(),
100 new CharsetRecog_8859_1_pt(),
101 new CharsetRecog_8859_1_sv(),
102 new CharsetRecog_8859_2_cs(),
103 new CharsetRecog_8859_2_hu(),
104 new CharsetRecog_8859_2_pl(),
105 new CharsetRecog_8859_2_ro(),
106 new CharsetRecog_8859_5_ru(),
107 new CharsetRecog_8859_6_ar(),
108 new CharsetRecog_8859_7_el(),
109 new CharsetRecog_8859_8_I_he(),
110 new CharsetRecog_8859_8_he(),
111 new CharsetRecog_windows_1251(),
112 new CharsetRecog_windows_1256(),
113 new CharsetRecog_KOI8_R(),
114 new CharsetRecog_8859_9_tr(),
115 new CharsetRecog_sjis(),
116 new CharsetRecog_gb_18030(),
117 new CharsetRecog_euc_jp(),
118 new CharsetRecog_euc_kr(),
119 new CharsetRecog_big5(),
121 new CharsetRecog_2022JP(),
122 new CharsetRecog_2022KR(),
123 new CharsetRecog_2022CN()
125 int32_t rCount
= ARRAY_SIZE(tempArray
);
128 recognizers
= NEW_ARRAY(CharsetRecognizer
*, rCount
);
130 if (recognizers
== NULL
) {
131 status
= U_MEMORY_ALLOCATION_ERROR
;
133 for (r
= 0; r
< rCount
; r
+= 1) {
134 recognizers
[r
] = tempArray
[r
];
136 if (recognizers
[r
] == NULL
) {
137 status
= U_MEMORY_ALLOCATION_ERROR
;
143 if (U_SUCCESS(status
)) {
145 if (fCSRecognizers
== NULL
) {
146 fCSRecognizers
= recognizers
;
147 fCSRecognizers_size
= rCount
;
152 if (fCSRecognizers
!= recognizers
) {
153 for (r
= 0; r
< rCount
; r
+= 1) {
154 delete recognizers
[r
];
155 recognizers
[r
] = NULL
;
158 DELETE_ARRAY(recognizers
);
162 ucln_i18n_registerCleanup(UCLN_I18N_CSDET
, csdet_cleanup
);
166 CharsetDetector::CharsetDetector(UErrorCode
&status
)
167 : textIn(new InputText()), resultCount(0), fStripTags(FALSE
), fFreshTextSet(FALSE
)
169 if (U_FAILURE(status
)) {
173 setRecognizers(status
);
175 if (U_FAILURE(status
)) {
179 resultArray
= (CharsetMatch
**)uprv_malloc(sizeof(CharsetMatch
*)*fCSRecognizers_size
);
181 if (resultArray
== NULL
) {
182 status
= U_MEMORY_ALLOCATION_ERROR
;
186 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
187 resultArray
[i
] = new CharsetMatch();
189 if (resultArray
[i
] == NULL
) {
190 status
= U_MEMORY_ALLOCATION_ERROR
;
196 CharsetDetector::~CharsetDetector()
200 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
201 delete resultArray
[i
];
204 uprv_free(resultArray
);
207 void CharsetDetector::setText(const char *in
, int32_t len
)
209 textIn
->setText(in
, len
);
210 fFreshTextSet
= TRUE
;
213 UBool
CharsetDetector::setStripTagsFlag(UBool flag
)
215 UBool temp
= fStripTags
;
217 fFreshTextSet
= TRUE
;
221 UBool
CharsetDetector::getStripTagsFlag() const
226 void CharsetDetector::setDeclaredEncoding(const char *encoding
, int32_t len
) const
228 textIn
->setDeclaredEncoding(encoding
,len
);
231 int32_t CharsetDetector::getDetectableCount()
233 UErrorCode status
= U_ZERO_ERROR
;
235 setRecognizers(status
);
237 return fCSRecognizers_size
;
240 const CharsetMatch
*CharsetDetector::detect(UErrorCode
&status
)
242 int32_t maxMatchesFound
= 0;
244 detectAll(maxMatchesFound
, status
);
246 if(maxMatchesFound
> 0) {
247 return resultArray
[0];
253 const CharsetMatch
* const *CharsetDetector::detectAll(int32_t &maxMatchesFound
, UErrorCode
&status
)
255 if(!textIn
->isSet()) {
256 status
= U_MISSING_RESOURCE_ERROR
;// TODO: Need to set proper status code for input text not set
259 } else if(fFreshTextSet
) {
260 CharsetRecognizer
*csr
;
261 int32_t detectResults
;
264 textIn
->MungeInput(fStripTags
);
266 // Iterate over all possible charsets, remember all that
267 // give a match quality > 0.
269 for (int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
270 csr
= fCSRecognizers
[i
];
271 detectResults
= csr
->match(textIn
);
272 confidence
= detectResults
;
274 if (confidence
> 0) {
275 resultArray
[resultCount
++]->set(textIn
, csr
, confidence
);
279 for(int32_t i
= resultCount
; i
< fCSRecognizers_size
; i
+= 1) {
280 resultArray
[i
]->set(textIn
, 0, 0);
283 uprv_sortArray(resultArray
, resultCount
, sizeof resultArray
[0], charsetMatchComparator
, NULL
, TRUE
, &status
);
285 //for(int32_t i = resultCount; i > 1; i -= 1) {
286 // for(int32_t j = 0; j < i-1; j += 1) {
287 // if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
288 // CharsetMatch *temp = resultArray[j];
289 // resultArray[j] = resultArray[j+1];
290 // resultArray[j+1] = temp;
295 fFreshTextSet
= FALSE
;
298 maxMatchesFound
= resultCount
;
303 const char *CharsetDetector::getCharsetName(int32_t index
, UErrorCode
&status
) const
305 if( index
> fCSRecognizers_size
-1 || index
< 0) {
306 status
= U_INDEX_OUTOFBOUNDS_ERROR
;
310 return fCSRecognizers
[index
]->getName();
323 static void U_CALLCONV
324 enumClose(UEnumeration
*en
) {
325 if(en
->context
!= NULL
) {
326 DELETE_ARRAY(en
->context
);
332 static int32_t U_CALLCONV
333 enumCount(UEnumeration
*, UErrorCode
*) {
334 return fCSRecognizers_size
;
337 static const char* U_CALLCONV
338 enumNext(UEnumeration
*en
, int32_t *resultLength
, UErrorCode
*status
) {
339 if(((Context
*)en
->context
)->currIndex
>= fCSRecognizers_size
) {
340 if(resultLength
!= NULL
) {
345 const char *currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->getName();
346 if(resultLength
!= NULL
) {
347 *resultLength
= (int32_t)uprv_strlen(currName
);
349 ((Context
*)en
->context
)->currIndex
++;
354 static void U_CALLCONV
355 enumReset(UEnumeration
*en
, UErrorCode
*) {
356 ((Context
*)en
->context
)->currIndex
= 0;
359 static const UEnumeration gCSDetEnumeration
= {
369 U_CAPI UEnumeration
* U_EXPORT2
370 ucsdet_getAllDetectableCharsets(const UCharsetDetector
*ucsd
, UErrorCode
*status
)
372 if(U_FAILURE(*status
)) {
376 /* Initialize recognized charsets. */
377 CharsetDetector::getDetectableCount();
379 UEnumeration
*en
= NEW_ARRAY(UEnumeration
, 1);
380 memcpy(en
, &gCSDetEnumeration
, sizeof(UEnumeration
));
381 en
->context
= (void*)NEW_ARRAY(Context
, 1);
382 uprv_memset(en
->context
, 0, sizeof(Context
));