icuSources/i18n/csdetect.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2009, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_CONVERSION
  11
  12 #include "unicode/ucsdet.h"
  13
  14 #include "csdetect.h"
  15 #include "csmatch.h"
  16 #include "uenumimp.h"
  17
  18 #include "cmemory.h"
  19 #include "cstring.h"
  20 #include "umutex.h"
  21 #include "ucln_in.h"
  22 #include "uarrsort.h"
  23 #include "inputext.h"
  24 #include "csrsbcs.h"
  25 #include "csrmbcs.h"
  26 #include "csrutf8.h"
  27 #include "csrucode.h"
  28 #include "csr2022.h"
  29
  30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  31
  32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
  33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
  34
  35 U_CDECL_BEGIN
  36 static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;
  37
  38 static int32_t fCSRecognizers_size = 0;
  39
  40 static UBool U_CALLCONV csdet_cleanup(void)
  41 {
  42     if (fCSRecognizers != NULL) {
  43         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
  44             delete fCSRecognizers[r];
  45             fCSRecognizers[r] = NULL;
  46         }
  47
  48         DELETE_ARRAY(fCSRecognizers);
  49         fCSRecognizers = NULL;
  50         fCSRecognizers_size = 0;
  51     }
  52
  53     return TRUE;
  54 }
  55
  56 static int32_t U_CALLCONV
  57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
  58 {
  59     U_NAMESPACE_USE
  60
  61     const CharsetMatch **csm_l = (const CharsetMatch **) left;
  62     const CharsetMatch **csm_r = (const CharsetMatch **) right;
  63
  64     // NOTE: compare is backwards to sort from highest to lowest.
  65     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
  66 }
  67
  68 U_CDECL_END
  69
  70 U_NAMESPACE_BEGIN
  71
  72 void CharsetDetector::setRecognizers(UErrorCode &status)
  73 {
  74     UBool needsInit;
  75     CharsetRecognizer **recognizers;
  76
  77     if (U_FAILURE(status)) {
  78         return;
  79     }
  80
  81     UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
  82
  83     if (needsInit) {
  84         CharsetRecognizer *tempArray[] = {
  85             new CharsetRecog_UTF8(),
  86
  87             new CharsetRecog_UTF_16_BE(),
  88             new CharsetRecog_UTF_16_LE(),
  89             new CharsetRecog_UTF_32_BE(),
  90             new CharsetRecog_UTF_32_LE(),
  91
  92             new CharsetRecog_8859_1_en(),
  93             new CharsetRecog_8859_1_da(),
  94             new CharsetRecog_8859_1_de(),
  95             new CharsetRecog_8859_1_es(),
  96             new CharsetRecog_8859_1_fr(),
  97             new CharsetRecog_8859_1_it(),
  98             new CharsetRecog_8859_1_nl(),
  99             new CharsetRecog_8859_1_no(),
 100             new CharsetRecog_8859_1_pt(),
 101             new CharsetRecog_8859_1_sv(),
 102             new CharsetRecog_8859_2_cs(),
 103             new CharsetRecog_8859_2_hu(),
 104             new CharsetRecog_8859_2_pl(),
 105             new CharsetRecog_8859_2_ro(),
 106             new CharsetRecog_8859_5_ru(),
 107             new CharsetRecog_8859_6_ar(),
 108             new CharsetRecog_8859_7_el(),
 109             new CharsetRecog_8859_8_I_he(),
 110             new CharsetRecog_8859_8_he(),
 111             new CharsetRecog_windows_1251(),
 112             new CharsetRecog_windows_1256(),
 113             new CharsetRecog_KOI8_R(),
 114             new CharsetRecog_8859_9_tr(),
 115             new CharsetRecog_sjis(),
 116             new CharsetRecog_gb_18030(),
 117             new CharsetRecog_euc_jp(),
 118             new CharsetRecog_euc_kr(),
 119             new CharsetRecog_big5(),
 120
 121             new CharsetRecog_2022JP(),
 122             new CharsetRecog_2022KR(),
 123             new CharsetRecog_2022CN(),
 124
 125             new CharsetRecog_IBM424_he_rtl(),
 126             new CharsetRecog_IBM424_he_ltr(),
 127             new CharsetRecog_IBM420_ar_rtl(),
 128             new CharsetRecog_IBM420_ar_ltr()
 129         };
 130         int32_t rCount = ARRAY_SIZE(tempArray);
 131         int32_t r;
 132
 133         recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
 134
 135         if (recognizers == NULL) {
 136             status = U_MEMORY_ALLOCATION_ERROR;
 137             return;
 138         } else {
 139             for (r = 0; r < rCount; r += 1) {
 140                 recognizers[r] = tempArray[r];
 141
 142                 if (recognizers[r] == NULL) {
 143                     status = U_MEMORY_ALLOCATION_ERROR;
 144                     break;
 145                 }
 146             }
 147         }
 148
 149         if (U_SUCCESS(status)) {
 150             umtx_lock(NULL);
 151             if (fCSRecognizers == NULL) {
 152                 fCSRecognizers_size = rCount;
 153                 fCSRecognizers = recognizers;
 154             }
 155             umtx_unlock(NULL);
 156         }
 157
 158         if (fCSRecognizers != recognizers) {
 159             for (r = 0; r < rCount; r += 1) {
 160                 delete recognizers[r];
 161                 recognizers[r] = NULL;
 162             }
 163
 164             DELETE_ARRAY(recognizers);
 165         }
 166
 167         recognizers = NULL;
 168         ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
 169     }
 170 }
 171
 172 CharsetDetector::CharsetDetector(UErrorCode &status)
 173   : textIn(new InputText(status)), resultArray(NULL),
 174     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
 175 {
 176     if (U_FAILURE(status)) {
 177         return;
 178     }
 179
 180     setRecognizers(status);
 181
 182     if (U_FAILURE(status)) {
 183         return;
 184     }
 185
 186     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
 187
 188     if (resultArray == NULL) {
 189         status = U_MEMORY_ALLOCATION_ERROR;
 190         return;
 191     }
 192
 193     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
 194         resultArray[i] = new CharsetMatch();
 195
 196         if (resultArray[i] == NULL) {
 197             status = U_MEMORY_ALLOCATION_ERROR;
 198             break;
 199         }
 200     }
 201 }
 202
 203 CharsetDetector::~CharsetDetector()
 204 {
 205     delete textIn;
 206
 207     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
 208         delete resultArray[i];
 209     }
 210
 211     uprv_free(resultArray);
 212 }
 213
 214 void CharsetDetector::setText(const char *in, int32_t len)
 215 {
 216     textIn->setText(in, len);
 217     fFreshTextSet = TRUE;
 218 }
 219
 220 UBool CharsetDetector::setStripTagsFlag(UBool flag)
 221 {
 222     UBool temp = fStripTags;
 223     fStripTags = flag;
 224     fFreshTextSet = TRUE;
 225     return temp;
 226 }
 227
 228 UBool CharsetDetector::getStripTagsFlag() const
 229 {
 230     return fStripTags;
 231 }
 232
 233 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
 234 {
 235     textIn->setDeclaredEncoding(encoding,len);
 236 }
 237
 238 int32_t CharsetDetector::getDetectableCount()
 239 {
 240     UErrorCode status = U_ZERO_ERROR;
 241
 242     setRecognizers(status);
 243
 244     return fCSRecognizers_size;
 245 }
 246
 247 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
 248 {
 249     int32_t maxMatchesFound = 0;
 250
 251     detectAll(maxMatchesFound, status);
 252
 253     if(maxMatchesFound > 0) {
 254         return resultArray[0];
 255     } else {
 256         return NULL;
 257     }
 258 }
 259
 260 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
 261 {
 262     if(!textIn->isSet()) {
 263         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
 264
 265         return NULL;
 266     } else if(fFreshTextSet) {
 267         CharsetRecognizer *csr;
 268         int32_t            detectResults;
 269         int32_t            confidence;
 270         int32_t            i;
 271
 272         textIn->MungeInput(fStripTags);
 273
 274         // Iterate over all possible charsets, remember all that
 275         // give a match quality > 0.
 276         resultCount = 0;
 277         for (i = 0; i < fCSRecognizers_size; i += 1) {
 278             csr = fCSRecognizers[i];
 279             detectResults = csr->match(textIn);
 280             confidence = detectResults;
 281
 282             if (confidence > 0)  {
 283                 resultArray[resultCount++]->set(textIn, csr, confidence);
 284             }
 285         }
 286
 287         for(i = resultCount; i < fCSRecognizers_size; i += 1) {
 288             resultArray[i]->set(textIn, 0, 0);
 289         }
 290
 291         uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
 292
 293         // Remove duplicate charsets from the results.
 294         // Simple minded, brute force approach - check each entry against all that follow.
 295         // The first entry of any duplicated set is the one that should be kept because it will
 296         // be the one with the highest confidence rating.
 297         //   (Duplicate matches have different languages, only the charset is the same)
 298         // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
 299         // deleted, just reordered, with the unwanted duplicates placed after the good results.
 300         int32_t j, k;
 301         for (i=0; i<resultCount; i++) {
 302             const char *charSetName = resultArray[i]->getName();
 303             for (j=i+1; j<resultCount; ) {
 304                 if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
 305                     // Not a duplicate.
 306                     j++;
 307                 } else {
 308                     // Duplicate entry at index j.
 309                     CharsetMatch *duplicate = resultArray[j];
 310                     for (k=j; k<resultCount-1; k++) {
 311                         resultArray[k] = resultArray[k+1];
 312                     }
 313                     resultCount--;
 314                     resultArray[resultCount] = duplicate;
 315                 }
 316             }
 317         }
 318
 319         fFreshTextSet = FALSE;
 320     }
 321
 322     maxMatchesFound = resultCount;
 323
 324     return resultArray;
 325 }
 326
 327 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
 328 {
 329     if( index > fCSRecognizers_size-1 || index < 0) {
 330         status = U_INDEX_OUTOFBOUNDS_ERROR;
 331
 332         return 0;
 333     } else {
 334         return fCSRecognizers[index]->getName();
 335     }
 336 }*/
 337
 338 U_NAMESPACE_END
 339
 340 U_CDECL_BEGIN
 341 typedef struct {
 342     int32_t currIndex;
 343 } Context;
 344
 345
 346
 347 static void U_CALLCONV
 348 enumClose(UEnumeration *en) {
 349     if(en->context != NULL) {
 350         DELETE_ARRAY(en->context);
 351     }
 352
 353     DELETE_ARRAY(en);
 354 }
 355
 356 static int32_t U_CALLCONV
 357 enumCount(UEnumeration *, UErrorCode *) {
 358     return fCSRecognizers_size;
 359 }
 360
 361 static const char* U_CALLCONV
 362 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
 363     if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
 364         if(resultLength != NULL) {
 365             *resultLength = 0;
 366         }
 367         return NULL;
 368     }
 369     const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
 370     if(resultLength != NULL) {
 371         *resultLength = (int32_t)uprv_strlen(currName);
 372     }
 373     ((Context *)en->context)->currIndex++;
 374
 375     return currName;
 376 }
 377
 378 static void U_CALLCONV
 379 enumReset(UEnumeration *en, UErrorCode *) {
 380     ((Context *)en->context)->currIndex = 0;
 381 }
 382
 383 static const UEnumeration gCSDetEnumeration = {
 384     NULL,
 385     NULL,
 386     enumClose,
 387     enumCount,
 388     uenum_unextDefault,
 389     enumNext,
 390     enumReset
 391 };
 392
 393 U_CAPI  UEnumeration * U_EXPORT2
 394 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
 395 {
 396     U_NAMESPACE_USE
 397
 398     if(U_FAILURE(*status)) {
 399         return 0;
 400     }
 401
 402     /* Initialize recognized charsets. */
 403     CharsetDetector::getDetectableCount();
 404
 405     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
 406     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
 407     en->context = (void*)NEW_ARRAY(Context, 1);
 408     uprv_memset(en->context, 0, sizeof(Context));
 409     return en;
 410 }
 411 U_CDECL_END
 412
 413 #endif
 414