]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csdetect.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / i18n / csdetect.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "unicode/ucsdet.h"
13
14 #include "csdetect.h"
15 #include "csmatch.h"
16 #include "uenumimp.h"
17
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "umutex.h"
21 #include "ucln_in.h"
22 #include "uarrsort.h"
23 #include "inputext.h"
24 #include "csrsbcs.h"
25 #include "csrmbcs.h"
26 #include "csrutf8.h"
27 #include "csrucode.h"
28 #include "csr2022.h"
29
/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocate / release raw storage for `count` objects of `type` through the ICU allocator. */
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35 U_CDECL_BEGIN
// File-wide table of charset recognizers. It is filled in by
// CharsetDetector::setRecognizers() and released by csdet_cleanup().
static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;

// Number of entries in fCSRecognizers; stays 0 while the table is unset.
static int32_t fCSRecognizers_size = 0;
39
40 static UBool U_CALLCONV csdet_cleanup(void)
41 {
42 if (fCSRecognizers != NULL) {
43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44 delete fCSRecognizers[r];
45 fCSRecognizers[r] = NULL;
46 }
47
48 DELETE_ARRAY(fCSRecognizers);
49 fCSRecognizers = NULL;
50 fCSRecognizers_size = 0;
51 }
52
53 return TRUE;
54 }
55
56 static int32_t U_CALLCONV
57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58 {
59 U_NAMESPACE_USE
60
61 const CharsetMatch **csm_l = (const CharsetMatch **) left;
62 const CharsetMatch **csm_r = (const CharsetMatch **) right;
63
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66 }
67
68 U_CDECL_END
69
70 U_NAMESPACE_BEGIN
71
72 void CharsetDetector::setRecognizers(UErrorCode &status)
73 {
74 UBool needsInit;
75 CharsetRecognizer **recognizers;
76
77 if (U_FAILURE(status)) {
78 return;
79 }
80
81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82
83 if (needsInit) {
84 CharsetRecognizer *tempArray[] = {
85 new CharsetRecog_UTF8(),
86
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
91
92 new CharsetRecog_8859_1_en(),
93 new CharsetRecog_8859_1_da(),
94 new CharsetRecog_8859_1_de(),
95 new CharsetRecog_8859_1_es(),
96 new CharsetRecog_8859_1_fr(),
97 new CharsetRecog_8859_1_it(),
98 new CharsetRecog_8859_1_nl(),
99 new CharsetRecog_8859_1_no(),
100 new CharsetRecog_8859_1_pt(),
101 new CharsetRecog_8859_1_sv(),
102 new CharsetRecog_8859_2_cs(),
103 new CharsetRecog_8859_2_hu(),
104 new CharsetRecog_8859_2_pl(),
105 new CharsetRecog_8859_2_ro(),
106 new CharsetRecog_8859_5_ru(),
107 new CharsetRecog_8859_6_ar(),
108 new CharsetRecog_8859_7_el(),
109 new CharsetRecog_8859_8_I_he(),
110 new CharsetRecog_8859_8_he(),
111 new CharsetRecog_windows_1251(),
112 new CharsetRecog_windows_1256(),
113 new CharsetRecog_KOI8_R(),
114 new CharsetRecog_8859_9_tr(),
115 new CharsetRecog_sjis(),
116 new CharsetRecog_gb_18030(),
117 new CharsetRecog_euc_jp(),
118 new CharsetRecog_euc_kr(),
119 new CharsetRecog_big5(),
120
121 new CharsetRecog_2022JP(),
122 new CharsetRecog_2022KR(),
123 new CharsetRecog_2022CN()
124 };
125 int32_t rCount = ARRAY_SIZE(tempArray);
126 int32_t r;
127
128 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
129
130 if (recognizers == NULL) {
131 status = U_MEMORY_ALLOCATION_ERROR;
132 return;
133 } else {
134 for (r = 0; r < rCount; r += 1) {
135 recognizers[r] = tempArray[r];
136
137 if (recognizers[r] == NULL) {
138 status = U_MEMORY_ALLOCATION_ERROR;
139 break;
140 }
141 }
142 }
143
144 if (U_SUCCESS(status)) {
145 umtx_lock(NULL);
146 if (fCSRecognizers == NULL) {
147 fCSRecognizers = recognizers;
148 fCSRecognizers_size = rCount;
149 }
150 umtx_unlock(NULL);
151 }
152
153 if (fCSRecognizers != recognizers) {
154 for (r = 0; r < rCount; r += 1) {
155 delete recognizers[r];
156 recognizers[r] = NULL;
157 }
158
159 DELETE_ARRAY(recognizers);
160 }
161
162 recognizers = NULL;
163 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
164 }
165 }
166
167 CharsetDetector::CharsetDetector(UErrorCode &status)
168 : textIn(new InputText(status)), resultArray(NULL),
169 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
170 {
171 if (U_FAILURE(status)) {
172 return;
173 }
174
175 setRecognizers(status);
176
177 if (U_FAILURE(status)) {
178 return;
179 }
180
181 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
182
183 if (resultArray == NULL) {
184 status = U_MEMORY_ALLOCATION_ERROR;
185 return;
186 }
187
188 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
189 resultArray[i] = new CharsetMatch();
190
191 if (resultArray[i] == NULL) {
192 status = U_MEMORY_ALLOCATION_ERROR;
193 break;
194 }
195 }
196 }
197
198 CharsetDetector::~CharsetDetector()
199 {
200 delete textIn;
201
202 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
203 delete resultArray[i];
204 }
205
206 uprv_free(resultArray);
207 }
208
209 void CharsetDetector::setText(const char *in, int32_t len)
210 {
211 textIn->setText(in, len);
212 fFreshTextSet = TRUE;
213 }
214
215 UBool CharsetDetector::setStripTagsFlag(UBool flag)
216 {
217 UBool temp = fStripTags;
218 fStripTags = flag;
219 fFreshTextSet = TRUE;
220 return temp;
221 }
222
223 UBool CharsetDetector::getStripTagsFlag() const
224 {
225 return fStripTags;
226 }
227
228 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
229 {
230 textIn->setDeclaredEncoding(encoding,len);
231 }
232
233 int32_t CharsetDetector::getDetectableCount()
234 {
235 UErrorCode status = U_ZERO_ERROR;
236
237 setRecognizers(status);
238
239 return fCSRecognizers_size;
240 }
241
242 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
243 {
244 int32_t maxMatchesFound = 0;
245
246 detectAll(maxMatchesFound, status);
247
248 if(maxMatchesFound > 0) {
249 return resultArray[0];
250 } else {
251 return NULL;
252 }
253 }
254
255 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
256 {
257 if(!textIn->isSet()) {
258 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
259
260 return NULL;
261 } else if(fFreshTextSet) {
262 CharsetRecognizer *csr;
263 int32_t detectResults;
264 int32_t confidence;
265 int32_t i;
266
267 textIn->MungeInput(fStripTags);
268
269 // Iterate over all possible charsets, remember all that
270 // give a match quality > 0.
271 resultCount = 0;
272 for (i = 0; i < fCSRecognizers_size; i += 1) {
273 csr = fCSRecognizers[i];
274 detectResults = csr->match(textIn);
275 confidence = detectResults;
276
277 if (confidence > 0) {
278 resultArray[resultCount++]->set(textIn, csr, confidence);
279 }
280 }
281
282 for(i = resultCount; i < fCSRecognizers_size; i += 1) {
283 resultArray[i]->set(textIn, 0, 0);
284 }
285
286 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
287 ////Bubble sort
288 //for(int32_t i = resultCount; i > 1; i -= 1) {
289 // for(int32_t j = 0; j < i-1; j += 1) {
290 // if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
291 // CharsetMatch *temp = resultArray[j];
292 // resultArray[j] = resultArray[j+1];
293 // resultArray[j+1] = temp;
294 // }
295 // }
296 //}
297
298 fFreshTextSet = FALSE;
299 }
300
301 maxMatchesFound = resultCount;
302
303 return resultArray;
304 }
305
306 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
307 {
308 if( index > fCSRecognizers_size-1 || index < 0) {
309 status = U_INDEX_OUTOFBOUNDS_ERROR;
310
311 return 0;
312 } else {
313 return fCSRecognizers[index]->getName();
314 }
315 }*/
316
317 U_NAMESPACE_END
318
319 U_CDECL_BEGIN
/* Per-enumeration cursor: index of the next recognizer name to hand out. */
typedef struct Context {
    int32_t currIndex;
} Context;
323
324
325
326 static void U_CALLCONV
327 enumClose(UEnumeration *en) {
328 if(en->context != NULL) {
329 DELETE_ARRAY(en->context);
330 }
331
332 DELETE_ARRAY(en);
333 }
334
335 static int32_t U_CALLCONV
336 enumCount(UEnumeration *, UErrorCode *) {
337 return fCSRecognizers_size;
338 }
339
340 static const char* U_CALLCONV
341 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
342 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
343 if(resultLength != NULL) {
344 *resultLength = 0;
345 }
346 return NULL;
347 }
348 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
349 if(resultLength != NULL) {
350 *resultLength = (int32_t)uprv_strlen(currName);
351 }
352 ((Context *)en->context)->currIndex++;
353
354 return currName;
355 }
356
357 static void U_CALLCONV
358 enumReset(UEnumeration *en, UErrorCode *) {
359 ((Context *)en->context)->currIndex = 0;
360 }
361
// Template that ucsdet_getAllDetectableCharsets() copies into each new
// enumeration it creates; the copy's `context` is then pointed at a freshly
// allocated Context cursor.
// NOTE(review): the slot order must match the UEnumeration declaration in
// uenumimp.h -- presumably two context pointers followed by the close,
// count, UChar-next, char-next and reset callbacks; confirm against the header.
static const UEnumeration gCSDetEnumeration = {
    NULL,
    NULL,
    enumClose,
    enumCount,
    uenum_unextDefault,
    enumNext,
    enumReset
};
371
372 U_CAPI UEnumeration * U_EXPORT2
373 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status)
374 {
375 U_NAMESPACE_USE
376
377 if(U_FAILURE(*status)) {
378 return 0;
379 }
380
381 /* Initialize recognized charsets. */
382 CharsetDetector::getDetectableCount();
383
384 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
385 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
386 en->context = (void*)NEW_ARRAY(Context, 1);
387 uprv_memset(en->context, 0, sizeof(Context));
388 return en;
389 }
390 U_CDECL_END
391
392 #endif
393