]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csdetect.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / i18n / csdetect.cpp
CommitLineData
73c04bcf
A
1/*
2 **********************************************************************
46f4442e 3 * Copyright (C) 2005-2008, International Business Machines
73c04bcf
A
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "unicode/ucsdet.h"
13
14#include "csdetect.h"
15#include "csmatch.h"
16#include "uenumimp.h"
17
18#include "cmemory.h"
19#include "cstring.h"
20#include "umutex.h"
21#include "ucln_in.h"
22#include "uarrsort.h"
23#include "inputext.h"
24#include "csrsbcs.h"
25#include "csrmbcs.h"
26#include "csrutf8.h"
27#include "csrucode.h"
28#include "csr2022.h"
29
/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocate / release raw storage for `count` objects of `type` through the
 * uprv_ allocator. No constructors or destructors are run. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35U_CDECL_BEGIN
46f4442e 36static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;
73c04bcf
A
37
38static int32_t fCSRecognizers_size = 0;
39
40static UBool U_CALLCONV csdet_cleanup(void)
41{
42 if (fCSRecognizers != NULL) {
43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44 delete fCSRecognizers[r];
45 fCSRecognizers[r] = NULL;
46 }
47
48 DELETE_ARRAY(fCSRecognizers);
49 fCSRecognizers = NULL;
50 fCSRecognizers_size = 0;
51 }
52
53 return TRUE;
54}
55
56static int32_t U_CALLCONV
46f4442e 57charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
73c04bcf 58{
46f4442e
A
59 U_NAMESPACE_USE
60
73c04bcf
A
61 const CharsetMatch **csm_l = (const CharsetMatch **) left;
62 const CharsetMatch **csm_r = (const CharsetMatch **) right;
63
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66}
67
68U_CDECL_END
69
70U_NAMESPACE_BEGIN
71
72void CharsetDetector::setRecognizers(UErrorCode &status)
73{
74 UBool needsInit;
75 CharsetRecognizer **recognizers;
76
77 if (U_FAILURE(status)) {
78 return;
79 }
80
46f4442e 81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
73c04bcf
A
82
83 if (needsInit) {
84 CharsetRecognizer *tempArray[] = {
85 new CharsetRecog_UTF8(),
86
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
91
92 new CharsetRecog_8859_1_en(),
93 new CharsetRecog_8859_1_da(),
94 new CharsetRecog_8859_1_de(),
95 new CharsetRecog_8859_1_es(),
96 new CharsetRecog_8859_1_fr(),
97 new CharsetRecog_8859_1_it(),
98 new CharsetRecog_8859_1_nl(),
99 new CharsetRecog_8859_1_no(),
100 new CharsetRecog_8859_1_pt(),
101 new CharsetRecog_8859_1_sv(),
102 new CharsetRecog_8859_2_cs(),
103 new CharsetRecog_8859_2_hu(),
104 new CharsetRecog_8859_2_pl(),
105 new CharsetRecog_8859_2_ro(),
106 new CharsetRecog_8859_5_ru(),
107 new CharsetRecog_8859_6_ar(),
108 new CharsetRecog_8859_7_el(),
109 new CharsetRecog_8859_8_I_he(),
110 new CharsetRecog_8859_8_he(),
111 new CharsetRecog_windows_1251(),
112 new CharsetRecog_windows_1256(),
113 new CharsetRecog_KOI8_R(),
114 new CharsetRecog_8859_9_tr(),
115 new CharsetRecog_sjis(),
116 new CharsetRecog_gb_18030(),
117 new CharsetRecog_euc_jp(),
118 new CharsetRecog_euc_kr(),
119 new CharsetRecog_big5(),
120
121 new CharsetRecog_2022JP(),
122 new CharsetRecog_2022KR(),
123 new CharsetRecog_2022CN()
124 };
125 int32_t rCount = ARRAY_SIZE(tempArray);
126 int32_t r;
127
128 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
129
130 if (recognizers == NULL) {
131 status = U_MEMORY_ALLOCATION_ERROR;
46f4442e 132 return;
73c04bcf
A
133 } else {
134 for (r = 0; r < rCount; r += 1) {
135 recognizers[r] = tempArray[r];
136
137 if (recognizers[r] == NULL) {
138 status = U_MEMORY_ALLOCATION_ERROR;
139 break;
140 }
141 }
142 }
143
144 if (U_SUCCESS(status)) {
145 umtx_lock(NULL);
146 if (fCSRecognizers == NULL) {
147 fCSRecognizers = recognizers;
148 fCSRecognizers_size = rCount;
149 }
150 umtx_unlock(NULL);
151 }
152
153 if (fCSRecognizers != recognizers) {
154 for (r = 0; r < rCount; r += 1) {
155 delete recognizers[r];
156 recognizers[r] = NULL;
157 }
158
159 DELETE_ARRAY(recognizers);
160 }
161
162 recognizers = NULL;
163 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
164 }
165}
166
167CharsetDetector::CharsetDetector(UErrorCode &status)
46f4442e
A
168 : textIn(new InputText(status)), resultArray(NULL),
169 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
73c04bcf
A
170{
171 if (U_FAILURE(status)) {
172 return;
173 }
174
175 setRecognizers(status);
176
177 if (U_FAILURE(status)) {
178 return;
179 }
180
181 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
182
183 if (resultArray == NULL) {
184 status = U_MEMORY_ALLOCATION_ERROR;
185 return;
186 }
187
188 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
189 resultArray[i] = new CharsetMatch();
190
191 if (resultArray[i] == NULL) {
192 status = U_MEMORY_ALLOCATION_ERROR;
193 break;
194 }
195 }
196}
197
198CharsetDetector::~CharsetDetector()
199{
200 delete textIn;
201
202 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
203 delete resultArray[i];
204 }
205
206 uprv_free(resultArray);
207}
208
209void CharsetDetector::setText(const char *in, int32_t len)
210{
211 textIn->setText(in, len);
212 fFreshTextSet = TRUE;
213}
214
215UBool CharsetDetector::setStripTagsFlag(UBool flag)
216{
217 UBool temp = fStripTags;
218 fStripTags = flag;
219 fFreshTextSet = TRUE;
220 return temp;
221}
222
223UBool CharsetDetector::getStripTagsFlag() const
224{
225 return fStripTags;
226}
227
228void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
229{
230 textIn->setDeclaredEncoding(encoding,len);
231}
232
233int32_t CharsetDetector::getDetectableCount()
234{
235 UErrorCode status = U_ZERO_ERROR;
236
237 setRecognizers(status);
238
239 return fCSRecognizers_size;
240}
241
242const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
243{
244 int32_t maxMatchesFound = 0;
245
246 detectAll(maxMatchesFound, status);
247
248 if(maxMatchesFound > 0) {
249 return resultArray[0];
250 } else {
251 return NULL;
252 }
253}
254
255const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
256{
257 if(!textIn->isSet()) {
258 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
259
260 return NULL;
261 } else if(fFreshTextSet) {
262 CharsetRecognizer *csr;
263 int32_t detectResults;
264 int32_t confidence;
46f4442e 265 int32_t i;
73c04bcf
A
266
267 textIn->MungeInput(fStripTags);
268
269 // Iterate over all possible charsets, remember all that
270 // give a match quality > 0.
271 resultCount = 0;
46f4442e 272 for (i = 0; i < fCSRecognizers_size; i += 1) {
73c04bcf
A
273 csr = fCSRecognizers[i];
274 detectResults = csr->match(textIn);
275 confidence = detectResults;
276
277 if (confidence > 0) {
278 resultArray[resultCount++]->set(textIn, csr, confidence);
279 }
280 }
281
46f4442e 282 for(i = resultCount; i < fCSRecognizers_size; i += 1) {
73c04bcf
A
283 resultArray[i]->set(textIn, 0, 0);
284 }
285
286 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
287 ////Bubble sort
288 //for(int32_t i = resultCount; i > 1; i -= 1) {
289 // for(int32_t j = 0; j < i-1; j += 1) {
290 // if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
291 // CharsetMatch *temp = resultArray[j];
292 // resultArray[j] = resultArray[j+1];
293 // resultArray[j+1] = temp;
294 // }
295 // }
296 //}
297
298 fFreshTextSet = FALSE;
299 }
300
301 maxMatchesFound = resultCount;
302
303 return resultArray;
304}
305
46f4442e 306/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
73c04bcf
A
307{
308 if( index > fCSRecognizers_size-1 || index < 0) {
309 status = U_INDEX_OUTOFBOUNDS_ERROR;
310
311 return 0;
312 } else {
313 return fCSRecognizers[index]->getName();
314 }
46f4442e 315}*/
73c04bcf
A
316
317U_NAMESPACE_END
318
319U_CDECL_BEGIN
/* Per-enumeration state stored in UEnumeration::context: the index of the
 * next recognizer whose name enumNext() will return. */
typedef struct {
    int32_t currIndex;
} Context;
323
324
325
326static void U_CALLCONV
327enumClose(UEnumeration *en) {
328 if(en->context != NULL) {
329 DELETE_ARRAY(en->context);
330 }
331
332 DELETE_ARRAY(en);
333}
334
335static int32_t U_CALLCONV
336enumCount(UEnumeration *, UErrorCode *) {
337 return fCSRecognizers_size;
338}
339
340static const char* U_CALLCONV
46f4442e 341enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
73c04bcf
A
342 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
343 if(resultLength != NULL) {
344 *resultLength = 0;
345 }
346 return NULL;
347 }
348 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
349 if(resultLength != NULL) {
350 *resultLength = (int32_t)uprv_strlen(currName);
351 }
352 ((Context *)en->context)->currIndex++;
353
354 return currName;
355}
356
357static void U_CALLCONV
358enumReset(UEnumeration *en, UErrorCode *) {
359 ((Context *)en->context)->currIndex = 0;
360}
361
362static const UEnumeration gCSDetEnumeration = {
363 NULL,
364 NULL,
365 enumClose,
366 enumCount,
367 uenum_unextDefault,
368 enumNext,
369 enumReset
370};
371
372U_CAPI UEnumeration * U_EXPORT2
46f4442e 373ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status)
73c04bcf 374{
46f4442e
A
375 U_NAMESPACE_USE
376
73c04bcf
A
377 if(U_FAILURE(*status)) {
378 return 0;
379 }
380
381 /* Initialize recognized charsets. */
382 CharsetDetector::getDetectableCount();
383
384 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
385 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
386 en->context = (void*)NEW_ARRAY(Context, 1);
387 uprv_memset(en->context, 0, sizeof(Context));
388 return en;
389}
390U_CDECL_END
391
392#endif
46f4442e 393