]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csdetect.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / csdetect.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "unicode/ucsdet.h"
13
14 #include "csdetect.h"
15 #include "csmatch.h"
16 #include "uenumimp.h"
17
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "umutex.h"
21 #include "ucln_in.h"
22 #include "uarrsort.h"
23 #include "inputext.h"
24 #include "csrsbcs.h"
25 #include "csrmbcs.h"
26 #include "csrutf8.h"
27 #include "csrucode.h"
28 #include "csr2022.h"
29
// Number of elements in a statically sized array. The argument must be a real
// array, not a pointer. Arguments are fully parenthesised so that expression
// arguments expand as intended.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocates raw storage for `count` objects of `type` through uprv_malloc().
// No constructors run, and callers in this file treat a NULL result as an
// allocation failure. The whole expansion is parenthesised so the cast binds
// to the call even when the macro is used inside a larger expression.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
// Releases storage obtained from NEW_ARRAY through uprv_free().
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35 U_CDECL_BEGIN
36 static CharsetRecognizer **fCSRecognizers = NULL;
37
38 static int32_t fCSRecognizers_size = 0;
39
40 static UBool U_CALLCONV csdet_cleanup(void)
41 {
42 if (fCSRecognizers != NULL) {
43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44 delete fCSRecognizers[r];
45 fCSRecognizers[r] = NULL;
46 }
47
48 DELETE_ARRAY(fCSRecognizers);
49 fCSRecognizers = NULL;
50 fCSRecognizers_size = 0;
51 }
52
53 return TRUE;
54 }
55
56 static int32_t U_CALLCONV
57 charsetMatchComparator(const void *context, const void *left, const void *right)
58 {
59 const CharsetMatch **csm_l = (const CharsetMatch **) left;
60 const CharsetMatch **csm_r = (const CharsetMatch **) right;
61
62 // NOTE: compare is backwards to sort from highest to lowest.
63 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
64 }
65
66 U_CDECL_END
67
68 U_NAMESPACE_BEGIN
69
70 void CharsetDetector::setRecognizers(UErrorCode &status)
71 {
72 UBool needsInit;
73 CharsetRecognizer **recognizers;
74
75 if (U_FAILURE(status)) {
76 return;
77 }
78
79 umtx_lock(NULL);
80 needsInit = (UBool) (fCSRecognizers == NULL);
81 umtx_unlock(NULL);
82
83 if (needsInit) {
84 CharsetRecognizer *tempArray[] = {
85 new CharsetRecog_UTF8(),
86
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
91
92 new CharsetRecog_8859_1_en(),
93 new CharsetRecog_8859_1_da(),
94 new CharsetRecog_8859_1_de(),
95 new CharsetRecog_8859_1_es(),
96 new CharsetRecog_8859_1_fr(),
97 new CharsetRecog_8859_1_it(),
98 new CharsetRecog_8859_1_nl(),
99 new CharsetRecog_8859_1_no(),
100 new CharsetRecog_8859_1_pt(),
101 new CharsetRecog_8859_1_sv(),
102 new CharsetRecog_8859_2_cs(),
103 new CharsetRecog_8859_2_hu(),
104 new CharsetRecog_8859_2_pl(),
105 new CharsetRecog_8859_2_ro(),
106 new CharsetRecog_8859_5_ru(),
107 new CharsetRecog_8859_6_ar(),
108 new CharsetRecog_8859_7_el(),
109 new CharsetRecog_8859_8_I_he(),
110 new CharsetRecog_8859_8_he(),
111 new CharsetRecog_windows_1251(),
112 new CharsetRecog_windows_1256(),
113 new CharsetRecog_KOI8_R(),
114 new CharsetRecog_8859_9_tr(),
115 new CharsetRecog_sjis(),
116 new CharsetRecog_gb_18030(),
117 new CharsetRecog_euc_jp(),
118 new CharsetRecog_euc_kr(),
119 new CharsetRecog_big5(),
120
121 new CharsetRecog_2022JP(),
122 new CharsetRecog_2022KR(),
123 new CharsetRecog_2022CN()
124 };
125 int32_t rCount = ARRAY_SIZE(tempArray);
126 int32_t r;
127
128 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
129
130 if (recognizers == NULL) {
131 status = U_MEMORY_ALLOCATION_ERROR;
132 } else {
133 for (r = 0; r < rCount; r += 1) {
134 recognizers[r] = tempArray[r];
135
136 if (recognizers[r] == NULL) {
137 status = U_MEMORY_ALLOCATION_ERROR;
138 break;
139 }
140 }
141 }
142
143 if (U_SUCCESS(status)) {
144 umtx_lock(NULL);
145 if (fCSRecognizers == NULL) {
146 fCSRecognizers = recognizers;
147 fCSRecognizers_size = rCount;
148 }
149 umtx_unlock(NULL);
150 }
151
152 if (fCSRecognizers != recognizers) {
153 for (r = 0; r < rCount; r += 1) {
154 delete recognizers[r];
155 recognizers[r] = NULL;
156 }
157
158 DELETE_ARRAY(recognizers);
159 }
160
161 recognizers = NULL;
162 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
163 }
164 }
165
166 CharsetDetector::CharsetDetector(UErrorCode &status)
167 : textIn(new InputText()), resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
168 {
169 if (U_FAILURE(status)) {
170 return;
171 }
172
173 setRecognizers(status);
174
175 if (U_FAILURE(status)) {
176 return;
177 }
178
179 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
180
181 if (resultArray == NULL) {
182 status = U_MEMORY_ALLOCATION_ERROR;
183 return;
184 }
185
186 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
187 resultArray[i] = new CharsetMatch();
188
189 if (resultArray[i] == NULL) {
190 status = U_MEMORY_ALLOCATION_ERROR;
191 break;
192 }
193 }
194 }
195
196 CharsetDetector::~CharsetDetector()
197 {
198 delete textIn;
199
200 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
201 delete resultArray[i];
202 }
203
204 uprv_free(resultArray);
205 }
206
207 void CharsetDetector::setText(const char *in, int32_t len)
208 {
209 textIn->setText(in, len);
210 fFreshTextSet = TRUE;
211 }
212
213 UBool CharsetDetector::setStripTagsFlag(UBool flag)
214 {
215 UBool temp = fStripTags;
216 fStripTags = flag;
217 fFreshTextSet = TRUE;
218 return temp;
219 }
220
221 UBool CharsetDetector::getStripTagsFlag() const
222 {
223 return fStripTags;
224 }
225
226 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
227 {
228 textIn->setDeclaredEncoding(encoding,len);
229 }
230
231 int32_t CharsetDetector::getDetectableCount()
232 {
233 UErrorCode status = U_ZERO_ERROR;
234
235 setRecognizers(status);
236
237 return fCSRecognizers_size;
238 }
239
240 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
241 {
242 int32_t maxMatchesFound = 0;
243
244 detectAll(maxMatchesFound, status);
245
246 if(maxMatchesFound > 0) {
247 return resultArray[0];
248 } else {
249 return NULL;
250 }
251 }
252
253 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
254 {
255 if(!textIn->isSet()) {
256 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
257
258 return NULL;
259 } else if(fFreshTextSet) {
260 CharsetRecognizer *csr;
261 int32_t detectResults;
262 int32_t confidence;
263
264 textIn->MungeInput(fStripTags);
265
266 // Iterate over all possible charsets, remember all that
267 // give a match quality > 0.
268 resultCount = 0;
269 for (int32_t i = 0; i < fCSRecognizers_size; i += 1) {
270 csr = fCSRecognizers[i];
271 detectResults = csr->match(textIn);
272 confidence = detectResults;
273
274 if (confidence > 0) {
275 resultArray[resultCount++]->set(textIn, csr, confidence);
276 }
277 }
278
279 for(int32_t i = resultCount; i < fCSRecognizers_size; i += 1) {
280 resultArray[i]->set(textIn, 0, 0);
281 }
282
283 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
284 ////Bubble sort
285 //for(int32_t i = resultCount; i > 1; i -= 1) {
286 // for(int32_t j = 0; j < i-1; j += 1) {
287 // if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
288 // CharsetMatch *temp = resultArray[j];
289 // resultArray[j] = resultArray[j+1];
290 // resultArray[j+1] = temp;
291 // }
292 // }
293 //}
294
295 fFreshTextSet = FALSE;
296 }
297
298 maxMatchesFound = resultCount;
299
300 return resultArray;
301 }
302
303 const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
304 {
305 if( index > fCSRecognizers_size-1 || index < 0) {
306 status = U_INDEX_OUTOFBOUNDS_ERROR;
307
308 return 0;
309 } else {
310 return fCSRecognizers[index]->getName();
311 }
312 }
313
314 U_NAMESPACE_END
315
316 U_CDECL_BEGIN
// Cursor state for one charset-name enumeration. An instance is stored behind
// UEnumeration::context and read back by the enum* callbacks in this file.
typedef struct {
    // Index into the recognizer table of the next name enumNext() returns.
    int32_t currIndex;
} Context;
320
321
322
323 static void U_CALLCONV
324 enumClose(UEnumeration *en) {
325 if(en->context != NULL) {
326 DELETE_ARRAY(en->context);
327 }
328
329 DELETE_ARRAY(en);
330 }
331
332 static int32_t U_CALLCONV
333 enumCount(UEnumeration *, UErrorCode *) {
334 return fCSRecognizers_size;
335 }
336
337 static const char* U_CALLCONV
338 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode *status) {
339 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
340 if(resultLength != NULL) {
341 *resultLength = 0;
342 }
343 return NULL;
344 }
345 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
346 if(resultLength != NULL) {
347 *resultLength = (int32_t)uprv_strlen(currName);
348 }
349 ((Context *)en->context)->currIndex++;
350
351 return currName;
352 }
353
354 static void U_CALLCONV
355 enumReset(UEnumeration *en, UErrorCode *) {
356 ((Context *)en->context)->currIndex = 0;
357 }
358
359 static const UEnumeration gCSDetEnumeration = {
360 NULL,
361 NULL,
362 enumClose,
363 enumCount,
364 uenum_unextDefault,
365 enumNext,
366 enumReset
367 };
368
369 U_CAPI UEnumeration * U_EXPORT2
370 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status)
371 {
372 if(U_FAILURE(*status)) {
373 return 0;
374 }
375
376 /* Initialize recognized charsets. */
377 CharsetDetector::getDetectableCount();
378
379 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
380 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
381 en->context = (void*)NEW_ARRAY(Context, 1);
382 uprv_memset(en->context, 0, sizeof(Context));
383 return en;
384 }
385 U_CDECL_END
386
387 #endif