]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csdetect.cpp
ICU-511.32.tar.gz
[apple/icu.git] / icuSources / i18n / csdetect.cpp
CommitLineData
73c04bcf
A
1/*
2 **********************************************************************
51004dcb 3 * Copyright (C) 2005-2012, International Business Machines
73c04bcf
A
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "unicode/ucsdet.h"
13
14#include "csdetect.h"
15#include "csmatch.h"
16#include "uenumimp.h"
17
18#include "cmemory.h"
19#include "cstring.h"
20#include "umutex.h"
21#include "ucln_in.h"
22#include "uarrsort.h"
23#include "inputext.h"
24#include "csrsbcs.h"
25#include "csrmbcs.h"
26#include "csrutf8.h"
27#include "csrucode.h"
28#include "csr2022.h"
29
/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Raw allocation helpers built on uprv_malloc()/uprv_free(); no constructors or destructors run. */
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35U_CDECL_BEGIN
4388f060 36static icu::CharsetRecognizer **fCSRecognizers = NULL;
73c04bcf
A
37
38static int32_t fCSRecognizers_size = 0;
39
40static UBool U_CALLCONV csdet_cleanup(void)
41{
42 if (fCSRecognizers != NULL) {
43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44 delete fCSRecognizers[r];
45 fCSRecognizers[r] = NULL;
46 }
47
48 DELETE_ARRAY(fCSRecognizers);
49 fCSRecognizers = NULL;
50 fCSRecognizers_size = 0;
51 }
52
53 return TRUE;
54}
55
56static int32_t U_CALLCONV
46f4442e 57charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
73c04bcf 58{
46f4442e
A
59 U_NAMESPACE_USE
60
73c04bcf
A
61 const CharsetMatch **csm_l = (const CharsetMatch **) left;
62 const CharsetMatch **csm_r = (const CharsetMatch **) right;
63
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66}
67
68U_CDECL_END
69
70U_NAMESPACE_BEGIN
71
72void CharsetDetector::setRecognizers(UErrorCode &status)
73{
74 UBool needsInit;
75 CharsetRecognizer **recognizers;
76
77 if (U_FAILURE(status)) {
78 return;
79 }
80
46f4442e 81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
73c04bcf
A
82
83 if (needsInit) {
84 CharsetRecognizer *tempArray[] = {
85 new CharsetRecog_UTF8(),
86
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
91
51004dcb
A
92 new CharsetRecog_8859_1(),
93 new CharsetRecog_8859_2(),
73c04bcf
A
94 new CharsetRecog_8859_5_ru(),
95 new CharsetRecog_8859_6_ar(),
96 new CharsetRecog_8859_7_el(),
97 new CharsetRecog_8859_8_I_he(),
98 new CharsetRecog_8859_8_he(),
99 new CharsetRecog_windows_1251(),
100 new CharsetRecog_windows_1256(),
101 new CharsetRecog_KOI8_R(),
102 new CharsetRecog_8859_9_tr(),
103 new CharsetRecog_sjis(),
104 new CharsetRecog_gb_18030(),
105 new CharsetRecog_euc_jp(),
106 new CharsetRecog_euc_kr(),
107 new CharsetRecog_big5(),
108
109 new CharsetRecog_2022JP(),
110 new CharsetRecog_2022KR(),
729e4ab9
A
111 new CharsetRecog_2022CN(),
112
113 new CharsetRecog_IBM424_he_rtl(),
114 new CharsetRecog_IBM424_he_ltr(),
115 new CharsetRecog_IBM420_ar_rtl(),
116 new CharsetRecog_IBM420_ar_ltr()
73c04bcf
A
117 };
118 int32_t rCount = ARRAY_SIZE(tempArray);
119 int32_t r;
120
121 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
122
123 if (recognizers == NULL) {
124 status = U_MEMORY_ALLOCATION_ERROR;
46f4442e 125 return;
73c04bcf
A
126 } else {
127 for (r = 0; r < rCount; r += 1) {
128 recognizers[r] = tempArray[r];
129
130 if (recognizers[r] == NULL) {
131 status = U_MEMORY_ALLOCATION_ERROR;
132 break;
133 }
134 }
135 }
136
137 if (U_SUCCESS(status)) {
138 umtx_lock(NULL);
139 if (fCSRecognizers == NULL) {
73c04bcf 140 fCSRecognizers_size = rCount;
729e4ab9 141 fCSRecognizers = recognizers;
73c04bcf
A
142 }
143 umtx_unlock(NULL);
144 }
145
146 if (fCSRecognizers != recognizers) {
147 for (r = 0; r < rCount; r += 1) {
148 delete recognizers[r];
149 recognizers[r] = NULL;
150 }
151
152 DELETE_ARRAY(recognizers);
153 }
154
155 recognizers = NULL;
156 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
157 }
158}
159
160CharsetDetector::CharsetDetector(UErrorCode &status)
46f4442e
A
161 : textIn(new InputText(status)), resultArray(NULL),
162 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
73c04bcf
A
163{
164 if (U_FAILURE(status)) {
165 return;
166 }
167
168 setRecognizers(status);
169
170 if (U_FAILURE(status)) {
171 return;
172 }
173
174 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
175
176 if (resultArray == NULL) {
177 status = U_MEMORY_ALLOCATION_ERROR;
178 return;
179 }
180
181 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
182 resultArray[i] = new CharsetMatch();
183
184 if (resultArray[i] == NULL) {
185 status = U_MEMORY_ALLOCATION_ERROR;
186 break;
187 }
188 }
189}
190
191CharsetDetector::~CharsetDetector()
192{
193 delete textIn;
194
195 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
196 delete resultArray[i];
197 }
198
199 uprv_free(resultArray);
200}
201
202void CharsetDetector::setText(const char *in, int32_t len)
203{
204 textIn->setText(in, len);
205 fFreshTextSet = TRUE;
206}
207
208UBool CharsetDetector::setStripTagsFlag(UBool flag)
209{
210 UBool temp = fStripTags;
211 fStripTags = flag;
212 fFreshTextSet = TRUE;
213 return temp;
214}
215
216UBool CharsetDetector::getStripTagsFlag() const
217{
218 return fStripTags;
219}
220
221void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
222{
223 textIn->setDeclaredEncoding(encoding,len);
224}
225
226int32_t CharsetDetector::getDetectableCount()
227{
228 UErrorCode status = U_ZERO_ERROR;
229
230 setRecognizers(status);
231
232 return fCSRecognizers_size;
233}
234
235const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
236{
237 int32_t maxMatchesFound = 0;
238
239 detectAll(maxMatchesFound, status);
240
241 if(maxMatchesFound > 0) {
242 return resultArray[0];
243 } else {
244 return NULL;
245 }
246}
247
248const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
249{
250 if(!textIn->isSet()) {
251 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
252
253 return NULL;
51004dcb 254 } else if (fFreshTextSet) {
73c04bcf 255 CharsetRecognizer *csr;
46f4442e 256 int32_t i;
73c04bcf
A
257
258 textIn->MungeInput(fStripTags);
259
260 // Iterate over all possible charsets, remember all that
261 // give a match quality > 0.
262 resultCount = 0;
46f4442e 263 for (i = 0; i < fCSRecognizers_size; i += 1) {
73c04bcf 264 csr = fCSRecognizers[i];
51004dcb
A
265 if (csr->match(textIn, resultArray[resultCount])) {
266 resultCount++;
73c04bcf
A
267 }
268 }
269
51004dcb
A
270 if (resultCount > 1) {
271 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
729e4ab9 272 }
73c04bcf
A
273 fFreshTextSet = FALSE;
274 }
275
276 maxMatchesFound = resultCount;
277
278 return resultArray;
279}
280
46f4442e 281/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
73c04bcf
A
282{
283 if( index > fCSRecognizers_size-1 || index < 0) {
284 status = U_INDEX_OUTOFBOUNDS_ERROR;
285
286 return 0;
287 } else {
288 return fCSRecognizers[index]->getName();
289 }
46f4442e 290}*/
73c04bcf
A
291
292U_NAMESPACE_END
293
294U_CDECL_BEGIN
295typedef struct {
296 int32_t currIndex;
297} Context;
298
299
300
301static void U_CALLCONV
302enumClose(UEnumeration *en) {
303 if(en->context != NULL) {
304 DELETE_ARRAY(en->context);
305 }
306
307 DELETE_ARRAY(en);
308}
309
310static int32_t U_CALLCONV
311enumCount(UEnumeration *, UErrorCode *) {
312 return fCSRecognizers_size;
313}
314
315static const char* U_CALLCONV
46f4442e 316enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
73c04bcf
A
317 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
318 if(resultLength != NULL) {
319 *resultLength = 0;
320 }
321 return NULL;
322 }
323 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
324 if(resultLength != NULL) {
325 *resultLength = (int32_t)uprv_strlen(currName);
326 }
327 ((Context *)en->context)->currIndex++;
328
329 return currName;
330}
331
332static void U_CALLCONV
333enumReset(UEnumeration *en, UErrorCode *) {
334 ((Context *)en->context)->currIndex = 0;
335}
336
337static const UEnumeration gCSDetEnumeration = {
338 NULL,
339 NULL,
340 enumClose,
341 enumCount,
342 uenum_unextDefault,
343 enumNext,
344 enumReset
345};
346
347U_CAPI UEnumeration * U_EXPORT2
729e4ab9 348ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
73c04bcf 349{
46f4442e
A
350 U_NAMESPACE_USE
351
73c04bcf
A
352 if(U_FAILURE(*status)) {
353 return 0;
354 }
355
356 /* Initialize recognized charsets. */
357 CharsetDetector::getDetectableCount();
358
359 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
360 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
361 en->context = (void*)NEW_ARRAY(Context, 1);
362 uprv_memset(en->context, 0, sizeof(Context));
363 return en;
364}
365U_CDECL_END
366
367#endif
46f4442e 368