icuSources/i18n/csdetect.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2006, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_CONVERSION
  11
  12 #include "unicode/ucsdet.h"
  13
  14 #include "csdetect.h"
  15 #include "csmatch.h"
  16 #include "uenumimp.h"
  17
  18 #include "cmemory.h"
  19 #include "cstring.h"
  20 #include "umutex.h"
  21 #include "ucln_in.h"
  22 #include "uarrsort.h"
  23 #include "inputext.h"
  24 #include "csrsbcs.h"
  25 #include "csrmbcs.h"
  26 #include "csrutf8.h"
  27 #include "csrucode.h"
  28 #include "csr2022.h"
  29
  30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  31
  32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
  33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
  34
  35 U_CDECL_BEGIN
  36 static CharsetRecognizer **fCSRecognizers = NULL;
  37
  38 static int32_t fCSRecognizers_size = 0;
  39
  40 static UBool U_CALLCONV csdet_cleanup(void)
  41 {
  42     if (fCSRecognizers != NULL) {
  43         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
  44             delete fCSRecognizers[r];
  45             fCSRecognizers[r] = NULL;
  46         }
  47
  48         DELETE_ARRAY(fCSRecognizers);
  49         fCSRecognizers = NULL;
  50         fCSRecognizers_size = 0;
  51     }
  52
  53     return TRUE;
  54 }
  55
  56 static int32_t U_CALLCONV
  57 charsetMatchComparator(const void *context, const void *left, const void *right)
  58 {
  59     const CharsetMatch **csm_l = (const CharsetMatch **) left;
  60     const CharsetMatch **csm_r = (const CharsetMatch **) right;
  61
  62     // NOTE: compare is backwards to sort from highest to lowest.
  63     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
  64 }
  65
  66 U_CDECL_END
  67
  68 U_NAMESPACE_BEGIN
  69
  70 void CharsetDetector::setRecognizers(UErrorCode &status)
  71 {
  72     UBool needsInit;
  73     CharsetRecognizer **recognizers;
  74
  75     if (U_FAILURE(status)) {
  76         return;
  77     }
  78
  79     umtx_lock(NULL);
  80     needsInit = (UBool) (fCSRecognizers == NULL);
  81     umtx_unlock(NULL);
  82
  83     if (needsInit) {
  84         CharsetRecognizer *tempArray[] = {
  85             new CharsetRecog_UTF8(),
  86
  87             new CharsetRecog_UTF_16_BE(),
  88             new CharsetRecog_UTF_16_LE(),
  89             new CharsetRecog_UTF_32_BE(),
  90             new CharsetRecog_UTF_32_LE(),
  91
  92             new CharsetRecog_8859_1_en(),
  93             new CharsetRecog_8859_1_da(),
  94             new CharsetRecog_8859_1_de(),
  95             new CharsetRecog_8859_1_es(),
  96             new CharsetRecog_8859_1_fr(),
  97             new CharsetRecog_8859_1_it(),
  98             new CharsetRecog_8859_1_nl(),
  99             new CharsetRecog_8859_1_no(),
 100             new CharsetRecog_8859_1_pt(),
 101             new CharsetRecog_8859_1_sv(),
 102             new CharsetRecog_8859_2_cs(),
 103             new CharsetRecog_8859_2_hu(),
 104             new CharsetRecog_8859_2_pl(),
 105             new CharsetRecog_8859_2_ro(),
 106             new CharsetRecog_8859_5_ru(),
 107             new CharsetRecog_8859_6_ar(),
 108             new CharsetRecog_8859_7_el(),
 109             new CharsetRecog_8859_8_I_he(),
 110             new CharsetRecog_8859_8_he(),
 111             new CharsetRecog_windows_1251(),
 112             new CharsetRecog_windows_1256(),
 113             new CharsetRecog_KOI8_R(),
 114             new CharsetRecog_8859_9_tr(),
 115             new CharsetRecog_sjis(),
 116             new CharsetRecog_gb_18030(),
 117             new CharsetRecog_euc_jp(),
 118             new CharsetRecog_euc_kr(),
 119             new CharsetRecog_big5(),
 120
 121             new CharsetRecog_2022JP(),
 122             new CharsetRecog_2022KR(),
 123             new CharsetRecog_2022CN()
 124         };
 125         int32_t rCount = ARRAY_SIZE(tempArray);
 126         int32_t r;
 127
 128         recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
 129
 130         if (recognizers == NULL) {
 131             status = U_MEMORY_ALLOCATION_ERROR;
 132         } else {
 133             for (r = 0; r < rCount; r += 1) {
 134                 recognizers[r] = tempArray[r];
 135
 136                 if (recognizers[r] == NULL) {
 137                     status = U_MEMORY_ALLOCATION_ERROR;
 138                     break;
 139                 }
 140             }
 141         }
 142
 143         if (U_SUCCESS(status)) {
 144             umtx_lock(NULL);
 145             if (fCSRecognizers == NULL) {
 146                 fCSRecognizers = recognizers;
 147                 fCSRecognizers_size = rCount;
 148             }
 149             umtx_unlock(NULL);
 150         }
 151
 152         if (fCSRecognizers != recognizers) {
 153             for (r = 0; r < rCount; r += 1) {
 154                 delete recognizers[r];
 155                 recognizers[r] = NULL;
 156             }
 157
 158             DELETE_ARRAY(recognizers);
 159         }
 160
 161         recognizers = NULL;
 162         ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
 163     }
 164 }
 165
 166 CharsetDetector::CharsetDetector(UErrorCode &status)
 167   : textIn(new InputText()), resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
 168 {
 169     if (U_FAILURE(status)) {
 170         return;
 171     }
 172
 173     setRecognizers(status);
 174
 175     if (U_FAILURE(status)) {
 176         return;
 177     }
 178
 179     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
 180
 181     if (resultArray == NULL) {
 182         status = U_MEMORY_ALLOCATION_ERROR;
 183         return;
 184     }
 185
 186     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
 187         resultArray[i] = new CharsetMatch();
 188
 189         if (resultArray[i] == NULL) {
 190             status = U_MEMORY_ALLOCATION_ERROR;
 191             break;
 192         }
 193     }
 194 }
 195
 196 CharsetDetector::~CharsetDetector()
 197 {
 198     delete textIn;
 199
 200     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
 201         delete resultArray[i];
 202     }
 203
 204     uprv_free(resultArray);
 205 }
 206
 207 void CharsetDetector::setText(const char *in, int32_t len)
 208 {
 209     textIn->setText(in, len);
 210     fFreshTextSet = TRUE;
 211 }
 212
 213 UBool CharsetDetector::setStripTagsFlag(UBool flag)
 214 {
 215     UBool temp = fStripTags;
 216     fStripTags = flag;
 217     fFreshTextSet = TRUE;
 218     return temp;
 219 }
 220
 221 UBool CharsetDetector::getStripTagsFlag() const
 222 {
 223     return fStripTags;
 224 }
 225
 226 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
 227 {
 228     textIn->setDeclaredEncoding(encoding,len);
 229 }
 230
 231 int32_t CharsetDetector::getDetectableCount()
 232 {
 233     UErrorCode status = U_ZERO_ERROR;
 234
 235     setRecognizers(status);
 236
 237     return fCSRecognizers_size;
 238 }
 239
 240 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
 241 {
 242     int32_t maxMatchesFound = 0;
 243
 244     detectAll(maxMatchesFound, status);
 245
 246     if(maxMatchesFound > 0) {
 247         return resultArray[0];
 248     } else {
 249         return NULL;
 250     }
 251 }
 252
 253 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
 254 {
 255     if(!textIn->isSet()) {
 256         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
 257
 258         return NULL;
 259     } else if(fFreshTextSet) {
 260         CharsetRecognizer *csr;
 261         int32_t            detectResults;
 262         int32_t            confidence;
 263
 264         textIn->MungeInput(fStripTags);
 265
 266         // Iterate over all possible charsets, remember all that
 267         // give a match quality > 0.
 268         resultCount = 0;
 269         for (int32_t i = 0; i < fCSRecognizers_size; i += 1) {
 270             csr = fCSRecognizers[i];
 271             detectResults = csr->match(textIn);
 272             confidence = detectResults;
 273
 274             if (confidence > 0)  {
 275                 resultArray[resultCount++]->set(textIn, csr, confidence);
 276             }
 277         }
 278
 279         for(int32_t i = resultCount; i < fCSRecognizers_size; i += 1) {
 280             resultArray[i]->set(textIn, 0, 0);
 281         }
 282
 283         uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
 284         ////Bubble sort
 285         //for(int32_t i = resultCount; i > 1; i -= 1) {
 286         //    for(int32_t j = 0; j < i-1; j += 1) {
 287         //        if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
 288         //            CharsetMatch *temp = resultArray[j];
 289         //            resultArray[j] = resultArray[j+1];
 290         //            resultArray[j+1] = temp;
 291         //        }
 292         //    }
 293         //}
 294
 295         fFreshTextSet = FALSE;
 296     }
 297
 298     maxMatchesFound = resultCount;
 299
 300     return resultArray;
 301 }
 302
 303 const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
 304 {
 305     if( index > fCSRecognizers_size-1 || index < 0) {
 306         status = U_INDEX_OUTOFBOUNDS_ERROR;
 307
 308         return 0;
 309     } else {
 310         return fCSRecognizers[index]->getName();
 311     }
 312 }
 313
 314 U_NAMESPACE_END
 315
 316 U_CDECL_BEGIN
 317 typedef struct {
 318     int32_t currIndex;
 319 } Context;
 320
 321
 322
 323 static void U_CALLCONV
 324 enumClose(UEnumeration *en) {
 325     if(en->context != NULL) {
 326         DELETE_ARRAY(en->context);
 327     }
 328
 329     DELETE_ARRAY(en);
 330 }
 331
 332 static int32_t U_CALLCONV
 333 enumCount(UEnumeration *, UErrorCode *) {
 334     return fCSRecognizers_size;
 335 }
 336
 337 static const char* U_CALLCONV
 338 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode *status) {
 339     if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
 340         if(resultLength != NULL) {
 341             *resultLength = 0;
 342         }
 343         return NULL;
 344     }
 345     const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
 346     if(resultLength != NULL) {
 347         *resultLength = (int32_t)uprv_strlen(currName);
 348     }
 349     ((Context *)en->context)->currIndex++;
 350
 351     return currName;
 352 }
 353
 354 static void U_CALLCONV
 355 enumReset(UEnumeration *en, UErrorCode *) {
 356     ((Context *)en->context)->currIndex = 0;
 357 }
 358
 359 static const UEnumeration gCSDetEnumeration = {
 360     NULL,
 361     NULL,
 362     enumClose,
 363     enumCount,
 364     uenum_unextDefault,
 365     enumNext,
 366     enumReset
 367 };
 368
 369 U_CAPI  UEnumeration * U_EXPORT2
 370 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status)
 371 {
 372     if(U_FAILURE(*status)) {
 373         return 0;
 374     }
 375
 376     /* Initialize recognized charsets. */
 377     CharsetDetector::getDetectableCount();
 378
 379     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
 380     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
 381     en->context = (void*)NEW_ARRAY(Context, 1);
 382     uprv_memset(en->context, 0, sizeof(Context));
 383     return en;
 384 }
 385 U_CDECL_END
 386
 387 #endif