2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
12 #include "unicode/ucsdet.h"
// Element count of a true array (not a pointer): total size divided by the size of one element.
30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
// Raw storage for `count` objects of `type`, obtained from uprv_malloc; no constructors are run.
32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Hands `array` to uprv_free; the counterpart of NEW_ARRAY.
33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
// File-level table of recognizer objects. It starts out NULL, is installed by
// CharsetDetector::setRecognizers() and is torn down again by csdet_cleanup().
36 static icu::CharsetRecognizer
**fCSRecognizers
= NULL
;
// Number of entries in fCSRecognizers; 0 while no table is installed.
38 static int32_t fCSRecognizers_size
= 0;
// Cleanup callback registered through ucln_i18n_registerCleanup() in
// CharsetDetector::setRecognizers(). Destroys every recognizer in the shared
// table, frees the table itself and resets both globals to their initial state.
// NOTE(review): the statement producing the UBool result is not visible in this view.
40 static UBool U_CALLCONV
csdet_cleanup(void)
// Nothing to do when the table was never installed.
42 if (fCSRecognizers
!= NULL
) {
43 for(int32_t r
= 0; r
< fCSRecognizers_size
; r
+= 1) {
// Each entry was created with `new`, so it is released with `delete`.
44 delete fCSRecognizers
[r
];
45 fCSRecognizers
[r
] = NULL
;
// The array itself came from NEW_ARRAY, hence DELETE_ARRAY rather than delete[].
48 DELETE_ARRAY(fCSRecognizers
);
49 fCSRecognizers
= NULL
;
50 fCSRecognizers_size
= 0;
// Comparison callback handed to uprv_sortArray() by CharsetDetector::detectAll().
// `left` and `right` each point at an array element, i.e. at a CharsetMatch pointer.
// The context argument is unused. The result is the right-hand confidence minus
// the left-hand confidence.
56 static int32_t U_CALLCONV
57 charsetMatchComparator(const void * /*context*/, const void *left
, const void *right
)
// The sorted array holds pointers, so the element addresses are pointer-to-pointer.
61 const CharsetMatch
**csm_l
= (const CharsetMatch
**) left
;
62 const CharsetMatch
**csm_r
= (const CharsetMatch
**) right
;
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r
)->getConfidence() - (*csm_l
)->getConfidence();
// Builds the shared recognizer table (fCSRecognizers / fCSRecognizers_size) if it
// has not been installed yet, and registers csdet_cleanup() to release it.
//
// status: in/out error code. It is tested with U_FAILURE on entry and set to
//         U_MEMORY_ALLOCATION_ERROR when the table or one of its entries cannot
//         be allocated.
//
// NOTE(review): the declarations of `needsInit` and `r`, and whatever follows the
// failure checks, are not visible in this view.
72 void CharsetDetector::setRecognizers(UErrorCode
&status
)
75 CharsetRecognizer
**recognizers
;
77 if (U_FAILURE(status
)) {
// Stores into needsInit whether the shared table is still NULL.
// NOTE(review): presumably UMTX_CHECK reads the condition under a mutex - confirm
// against the macro definition.
81 UMTX_CHECK(NULL
, (UBool
) (fCSRecognizers
== NULL
), needsInit
);
// One freshly allocated instance of every recognizer class. A failed `new`
// is detected further down as a NULL entry.
84 CharsetRecognizer
*tempArray
[] = {
85 new CharsetRecog_UTF8(),
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
92 new CharsetRecog_8859_1(),
93 new CharsetRecog_8859_2(),
94 new CharsetRecog_8859_5_ru(),
95 new CharsetRecog_8859_6_ar(),
96 new CharsetRecog_8859_7_el(),
97 new CharsetRecog_8859_8_I_he(),
98 new CharsetRecog_8859_8_he(),
99 new CharsetRecog_windows_1251(),
100 new CharsetRecog_windows_1256(),
101 new CharsetRecog_KOI8_R(),
102 new CharsetRecog_8859_9_tr(),
103 new CharsetRecog_sjis(),
104 new CharsetRecog_gb_18030(),
105 new CharsetRecog_euc_jp(),
106 new CharsetRecog_euc_kr(),
107 new CharsetRecog_big5(),
109 new CharsetRecog_2022JP(),
110 new CharsetRecog_2022KR(),
111 new CharsetRecog_2022CN(),
113 new CharsetRecog_IBM424_he_rtl(),
114 new CharsetRecog_IBM424_he_ltr(),
115 new CharsetRecog_IBM420_ar_rtl(),
116 new CharsetRecog_IBM420_ar_ltr()
118 int32_t rCount
= ARRAY_SIZE(tempArray
);
// Heap copy of the table; this is the array that may become fCSRecognizers.
121 recognizers
= NEW_ARRAY(CharsetRecognizer
*, rCount
);
123 if (recognizers
== NULL
) {
124 status
= U_MEMORY_ALLOCATION_ERROR
;
// Copy the entries across, flagging an allocation error for any NULL entry.
127 for (r
= 0; r
< rCount
; r
+= 1) {
128 recognizers
[r
] = tempArray
[r
];
130 if (recognizers
[r
] == NULL
) {
131 status
= U_MEMORY_ALLOCATION_ERROR
;
// Publish the new table only when everything succeeded and no table is installed.
137 if (U_SUCCESS(status
)) {
139 if (fCSRecognizers
== NULL
) {
140 fCSRecognizers_size
= rCount
;
141 fCSRecognizers
= recognizers
;
// The local table was not the one installed (an error occurred, or a table was
// already present), so destroy its entries and free the array.
146 if (fCSRecognizers
!= recognizers
) {
147 for (r
= 0; r
< rCount
; r
+= 1) {
148 delete recognizers
[r
];
149 recognizers
[r
] = NULL
;
152 DELETE_ARRAY(recognizers
);
// Arrange for csdet_cleanup() to be called for the UCLN_I18N_CSDET slot.
156 ucln_i18n_registerCleanup(UCLN_I18N_CSDET
, csdet_cleanup
);
// Constructs a detector: allocates the InputText holder, makes sure the shared
// recognizer table exists, then allocates one CharsetMatch result slot per
// recognizer.
//
// status: in/out error code. It is set to U_MEMORY_ALLOCATION_ERROR when the
//         result array or one of its CharsetMatch objects cannot be allocated.
//
// NOTE(review): what happens inside the U_FAILURE branches is not visible in this view.
160 CharsetDetector::CharsetDetector(UErrorCode
&status
)
161 : textIn(new InputText(status
)), resultArray(NULL
),
162 resultCount(0), fStripTags(FALSE
), fFreshTextSet(FALSE
)
164 if (U_FAILURE(status
)) {
// Ensures fCSRecognizers / fCSRecognizers_size are populated before they are used below.
168 setRecognizers(status
);
170 if (U_FAILURE(status
)) {
// One pointer slot per recognizer, allocated with uprv_malloc
// (released with uprv_free in the destructor).
174 resultArray
= (CharsetMatch
**)uprv_malloc(sizeof(CharsetMatch
*)*fCSRecognizers_size
);
176 if (resultArray
== NULL
) {
177 status
= U_MEMORY_ALLOCATION_ERROR
;
// Fill every slot with its own CharsetMatch object.
181 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
182 resultArray
[i
] = new CharsetMatch();
184 if (resultArray
[i
] == NULL
) {
185 status
= U_MEMORY_ALLOCATION_ERROR
;
// Destroys the per-detector result slots: deletes each CharsetMatch and then
// frees the pointer array with uprv_free (it was obtained from uprv_malloc).
// NOTE(review): the constructor can leave resultArray NULL when allocation
// fails, and no NULL guard is visible before resultArray[i] is read here -
// confirm one exists in the lines not shown.
191 CharsetDetector::~CharsetDetector()
195 for(int32_t i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
196 delete resultArray
[i
];
199 uprv_free(resultArray
);
// Hands the input buffer to the InputText holder and marks the text as fresh,
// so that the next detectAll() call runs the recognizers again.
//
// in:  pointer to the bytes to analyse.
// len: length forwarded unchanged to InputText::setText.
202 void CharsetDetector::setText(const char *in
, int32_t len
)
204 textIn
->setText(in
, len
);
205 fFreshTextSet
= TRUE
;
// Captures the current strip-tags setting in `temp` and marks the text as
// fresh so that detection is redone on the next detectAll() call.
// NOTE(review): the assignment of `flag` and the return statement are not
// visible in this view; presumably the previous value held in `temp` is returned - confirm.
208 UBool
CharsetDetector::setStripTagsFlag(UBool flag
)
210 UBool temp
= fStripTags
;
212 fFreshTextSet
= TRUE
;
// Const accessor returning a UBool.
// NOTE(review): the body is not visible in this view; presumably it returns fStripTags - confirm.
216 UBool
CharsetDetector::getStripTagsFlag() const
// Forwards the caller-declared encoding name and its length to the InputText
// holder. The method is const: it does not modify the detector's own members,
// only the object textIn points to.
221 void CharsetDetector::setDeclaredEncoding(const char *encoding
, int32_t len
) const
223 textIn
->setDeclaredEncoding(encoding
,len
);
// Makes sure the shared recognizer table has been built and returns how many
// recognizers it holds.
// The local status is never reported to the caller: if setRecognizers() fails,
// the value returned is whatever fCSRecognizers_size currently holds.
226 int32_t CharsetDetector::getDetectableCount()
228 UErrorCode status
= U_ZERO_ERROR
;
230 setRecognizers(status
);
232 return fCSRecognizers_size
;
// Runs detectAll() and returns the first entry of the result array when at
// least one match was found. After detectAll() has sorted the results, that
// entry is the match with the highest confidence.
//
// status: in/out error code, passed straight through to detectAll().
//
// NOTE(review): the path taken when no match is found is not visible in this view.
235 const CharsetMatch
*CharsetDetector::detect(UErrorCode
&status
)
237 int32_t maxMatchesFound
= 0;
239 detectAll(maxMatchesFound
, status
);
241 if(maxMatchesFound
> 0) {
242 return resultArray
[0];
// Runs every recognizer over the current input (only when the text or the
// strip-tags flag changed since the last run), sorts the matches by descending
// confidence and reports how many were found.
//
// maxMatchesFound: output; receives resultCount.
// status:          in/out error code. It is set to U_MISSING_RESOURCE_ERROR
//                  when no input text has been set, and is also passed to
//                  uprv_sortArray.
//
// NOTE(review): the declaration of `i`, the updates to resultCount and the
// return statement are not visible in this view.
248 const CharsetMatch
* const *CharsetDetector::detectAll(int32_t &maxMatchesFound
, UErrorCode
&status
)
250 if(!textIn
->isSet()) {
251 status
= U_MISSING_RESOURCE_ERROR
;// TODO: Need to set proper status code for input text not set
// Cached results are reused unless setText()/setStripTagsFlag() marked the text as fresh.
254 } else if (fFreshTextSet
) {
255 CharsetRecognizer
*csr
;
// Prepare the input for matching, passing on the strip-tags setting.
258 textIn
->MungeInput(fStripTags
);
260 // Iterate over all possible charsets, remember all that
261 // give a match quality > 0.
263 for (i
= 0; i
< fCSRecognizers_size
; i
+= 1) {
264 csr
= fCSRecognizers
[i
];
// Each recognizer writes its result into the slot at index resultCount.
265 if (csr
->match(textIn
, resultArray
[resultCount
])) {
// Sorting is only needed when there is more than one match.
270 if (resultCount
> 1) {
271 uprv_sortArray(resultArray
, resultCount
, sizeof resultArray
[0], charsetMatchComparator
, NULL
, TRUE
, &status
);
// Results are now up to date; skip the work on the next call.
273 fFreshTextSet
= FALSE
;
276 maxMatchesFound
= resultCount
;
281 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
283 if( index > fCSRecognizers_size-1 || index < 0) {
284 status = U_INDEX_OUTOFBOUNDS_ERROR;
288 return fCSRecognizers[index]->getName();
// UEnumeration close callback: releases the Context block stored in
// en->context (allocated with NEW_ARRAY in ucsdet_getAllDetectableCharsets).
// NOTE(review): the release of `en` itself is not visible in this view.
301 static void U_CALLCONV
302 enumClose(UEnumeration
*en
) {
303 if(en
->context
!= NULL
) {
304 DELETE_ARRAY(en
->context
);
// UEnumeration count callback: the number of detectable charsets is the size
// of the shared recognizer table. Both arguments are unused.
310 static int32_t U_CALLCONV
311 enumCount(UEnumeration
*, UErrorCode
*) {
312 return fCSRecognizers_size
;
// UEnumeration next callback: looks up the name of the recognizer at the
// enumeration's current index, stores the name's length in *resultLength when
// that pointer is non-NULL, and advances the index by one.
//
// en:           enumeration whose context is a Context holding currIndex.
// resultLength: optional output for the length of the returned name.
//
// NOTE(review): the body of the end-of-enumeration branch and the return
// statements are not visible in this view.
315 static const char* U_CALLCONV
316 enumNext(UEnumeration
*en
, int32_t *resultLength
, UErrorCode
* /*status*/) {
// Past the last recognizer: nothing more to enumerate.
317 if(((Context
*)en
->context
)->currIndex
>= fCSRecognizers_size
) {
318 if(resultLength
!= NULL
) {
323 const char *currName
= fCSRecognizers
[((Context
*)en
->context
)->currIndex
]->getName();
324 if(resultLength
!= NULL
) {
325 *resultLength
= (int32_t)uprv_strlen(currName
);
// Move on to the next recognizer for the following call.
327 ((Context
*)en
->context
)->currIndex
++;
// UEnumeration reset callback: rewinds the enumeration to the first
// recognizer by zeroing the index kept in the Context. The error-code argument is unused.
332 static void U_CALLCONV
333 enumReset(UEnumeration
*en
, UErrorCode
*) {
334 ((Context
*)en
->context
)->currIndex
= 0;
// Template UEnumeration that ucsdet_getAllDetectableCharsets() copies with
// memcpy into each enumeration it creates.
// NOTE(review): the initializer contents are not visible in this view; presumably
// they wire up the enum* callbacks defined in this file - confirm.
337 static const UEnumeration gCSDetEnumeration
= {
// Creates a UEnumeration over the detectable charsets. The detector argument
// is unused; the enumeration is driven entirely by the shared recognizer table.
//
// status: in/out error code, tested with U_FAILURE before any work is done.
//
// NOTE(review): `en` and `en->context` are both passed to memcpy/uprv_memset
// straight after NEW_ARRAY with no NULL check in between, so a failed
// allocation would be dereferenced. Add a check that sets
// U_MEMORY_ALLOCATION_ERROR.
// NOTE(review): the rest of the function, including its return, lies outside this view.
347 U_CAPI UEnumeration
* U_EXPORT2
348 ucsdet_getAllDetectableCharsets(const UCharsetDetector
* /*ucsd*/, UErrorCode
*status
)
352 if(U_FAILURE(*status
)) {
356 /* Initialize recognized charsets. */
357 CharsetDetector::getDetectableCount();
// Start from a copy of the template enumeration.
359 UEnumeration
*en
= NEW_ARRAY(UEnumeration
, 1);
360 memcpy(en
, &gCSDetEnumeration
, sizeof(UEnumeration
));
// Give the enumeration its own zero-initialised Context (currIndex starts at 0).
361 en
->context
= (void*)NEW_ARRAY(Context
, 1);
362 uprv_memset(en
->context
, 0, sizeof(Context
));