icuSources/common/normlzr.cpp

   1 /*
   2  *************************************************************************
   3  * COPYRIGHT:
   4  * Copyright (c) 1996-2010, International Business Machines Corporation and
   5  * others. All Rights Reserved.
   6  *************************************************************************
   7  */
   8
   9 #include "unicode/utypes.h"
  10
  11 #if !UCONFIG_NO_NORMALIZATION
  12
  13 #include "unicode/uniset.h"
  14 #include "unicode/unistr.h"
  15 #include "unicode/chariter.h"
  16 #include "unicode/schriter.h"
  17 #include "unicode/uchriter.h"
  18 #include "unicode/normlzr.h"
  19 #include "cmemory.h"
  20 #include "normalizer2impl.h"
  21 #include "uprops.h"  // for uniset_getUnicode32Instance()
  22
  23 U_NAMESPACE_BEGIN
  24
  25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
  26
  27 //-------------------------------------------------------------------------
  28 // Constructors and other boilerplate
  29 //-------------------------------------------------------------------------
  30
  31 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
  32     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  33     text(new StringCharacterIterator(str)),
  34     currentIndex(0), nextIndex(0),
  35     buffer(), bufferPos(0)
  36 {
  37     init();
  38 }
  39
  40 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
  41     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  42     text(new UCharCharacterIterator(str, length)),
  43     currentIndex(0), nextIndex(0),
  44     buffer(), bufferPos(0)
  45 {
  46     init();
  47 }
  48
  49 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
  50     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  51     text(iter.clone()),
  52     currentIndex(0), nextIndex(0),
  53     buffer(), bufferPos(0)
  54 {
  55     init();
  56 }
  57
  58 Normalizer::Normalizer(const Normalizer &copy) :
  59     UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
  60     text(copy.text->clone()),
  61     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
  62     buffer(copy.buffer), bufferPos(copy.bufferPos)
  63 {
  64     init();
  65 }
  66
  67 static const UChar _NUL=0;
  68
  69 void
  70 Normalizer::init() {
  71     UErrorCode errorCode=U_ZERO_ERROR;
  72     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
  73     if(fOptions&UNORM_UNICODE_3_2) {
  74         delete fFilteredNorm2;
  75         fNorm2=fFilteredNorm2=
  76             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
  77     }
  78     if(U_FAILURE(errorCode)) {
  79         errorCode=U_ZERO_ERROR;
  80         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
  81     }
  82 }
  83
  84 Normalizer::~Normalizer()
  85 {
  86     delete fFilteredNorm2;
  87     delete text;
  88 }
  89
  90 Normalizer*
  91 Normalizer::clone() const
  92 {
  93     return new Normalizer(*this);
  94 }
  95
  96 /**
  97  * Generates a hash code for this iterator.
  98  */
  99 int32_t Normalizer::hashCode() const
 100 {
 101     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
 102 }
 103
 104 UBool Normalizer::operator==(const Normalizer& that) const
 105 {
 106     return
 107         this==&that ||
 108         (fUMode==that.fUMode &&
 109         fOptions==that.fOptions &&
 110         *text==*that.text &&
 111         buffer==that.buffer &&
 112         bufferPos==that.bufferPos &&
 113         nextIndex==that.nextIndex);
 114 }
 115
 116 //-------------------------------------------------------------------------
 117 // Static utility methods
 118 //-------------------------------------------------------------------------
 119
 120 void U_EXPORT2
 121 Normalizer::normalize(const UnicodeString& source,
 122                       UNormalizationMode mode, int32_t options,
 123                       UnicodeString& result,
 124                       UErrorCode &status) {
 125     if(source.isBogus() || U_FAILURE(status)) {
 126         result.setToBogus();
 127         if(U_SUCCESS(status)) {
 128             status=U_ILLEGAL_ARGUMENT_ERROR;
 129         }
 130     } else {
 131         UnicodeString localDest;
 132         UnicodeString *dest;
 133
 134         if(&source!=&result) {
 135             dest=&result;
 136         } else {
 137             // the source and result strings are the same object, use a temporary one
 138             dest=&localDest;
 139         }
 140         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 141         if(U_SUCCESS(status)) {
 142             if(options&UNORM_UNICODE_3_2) {
 143                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 144                     normalize(source, *dest, status);
 145             } else {
 146                 n2->normalize(source, *dest, status);
 147             }
 148         }
 149         if(dest==&localDest && U_SUCCESS(status)) {
 150             result=*dest;
 151         }
 152     }
 153 }
 154
 155 void U_EXPORT2
 156 Normalizer::compose(const UnicodeString& source,
 157                     UBool compat, int32_t options,
 158                     UnicodeString& result,
 159                     UErrorCode &status) {
 160     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
 161 }
 162
 163 void U_EXPORT2
 164 Normalizer::decompose(const UnicodeString& source,
 165                       UBool compat, int32_t options,
 166                       UnicodeString& result,
 167                       UErrorCode &status) {
 168     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
 169 }
 170
 171 UNormalizationCheckResult
 172 Normalizer::quickCheck(const UnicodeString& source,
 173                        UNormalizationMode mode, int32_t options,
 174                        UErrorCode &status) {
 175     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 176     if(U_SUCCESS(status)) {
 177         if(options&UNORM_UNICODE_3_2) {
 178             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 179                 quickCheck(source, status);
 180         } else {
 181             return n2->quickCheck(source, status);
 182         }
 183     } else {
 184         return UNORM_MAYBE;
 185     }
 186 }
 187
 188 UBool
 189 Normalizer::isNormalized(const UnicodeString& source,
 190                          UNormalizationMode mode, int32_t options,
 191                          UErrorCode &status) {
 192     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 193     if(U_SUCCESS(status)) {
 194         if(options&UNORM_UNICODE_3_2) {
 195             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 196                 isNormalized(source, status);
 197         } else {
 198             return n2->isNormalized(source, status);
 199         }
 200     } else {
 201         return FALSE;
 202     }
 203 }
 204
 205 UnicodeString & U_EXPORT2
 206 Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
 207                         UnicodeString &result,
 208                         UNormalizationMode mode, int32_t options,
 209                         UErrorCode &errorCode) {
 210     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
 211         result.setToBogus();
 212         if(U_SUCCESS(errorCode)) {
 213             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 214         }
 215     } else {
 216         UnicodeString localDest;
 217         UnicodeString *dest;
 218
 219         if(&right!=&result) {
 220             dest=&result;
 221         } else {
 222             // the right and result strings are the same object, use a temporary one
 223             dest=&localDest;
 224         }
 225         *dest=left;
 226         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
 227         if(U_SUCCESS(errorCode)) {
 228             if(options&UNORM_UNICODE_3_2) {
 229                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
 230                     append(*dest, right, errorCode);
 231             } else {
 232                 n2->append(*dest, right, errorCode);
 233             }
 234         }
 235         if(dest==&localDest && U_SUCCESS(errorCode)) {
 236             result=*dest;
 237         }
 238     }
 239     return result;
 240 }
 241
 242 //-------------------------------------------------------------------------
 243 // Iteration API
 244 //-------------------------------------------------------------------------
 245
 246 /**
 247  * Return the current character in the normalized text.
 248  */
 249 UChar32 Normalizer::current() {
 250     if(bufferPos<buffer.length() || nextNormalize()) {
 251         return buffer.char32At(bufferPos);
 252     } else {
 253         return DONE;
 254     }
 255 }
 256
 257 /**
 258  * Return the next character in the normalized text and advance
 259  * the iteration position by one.  If the end
 260  * of the text has already been reached, {@link #DONE} is returned.
 261  */
 262 UChar32 Normalizer::next() {
 263     if(bufferPos<buffer.length() ||  nextNormalize()) {
 264         UChar32 c=buffer.char32At(bufferPos);
 265         bufferPos+=UTF_CHAR_LENGTH(c);
 266         return c;
 267     } else {
 268         return DONE;
 269     }
 270 }
 271
 272 /**
 273  * Return the previous character in the normalized text and decrement
 274  * the iteration position by one.  If the beginning
 275  * of the text has already been reached, {@link #DONE} is returned.
 276  */
 277 UChar32 Normalizer::previous() {
 278     if(bufferPos>0 || previousNormalize()) {
 279         UChar32 c=buffer.char32At(bufferPos-1);
 280         bufferPos-=UTF_CHAR_LENGTH(c);
 281         return c;
 282     } else {
 283         return DONE;
 284     }
 285 }
 286
 287 void Normalizer::reset() {
 288     currentIndex=nextIndex=text->setToStart();
 289     clearBuffer();
 290 }
 291
 292 void
 293 Normalizer::setIndexOnly(int32_t index) {
 294     text->setIndex(index);  // pins index
 295     currentIndex=nextIndex=text->getIndex();
 296     clearBuffer();
 297 }
 298
 299 /**
 300  * Return the first character in the normalized text.  This resets
 301  * the <tt>Normalizer's</tt> position to the beginning of the text.
 302  */
 303 UChar32 Normalizer::first() {
 304     reset();
 305     return next();
 306 }
 307
 308 /**
 309  * Return the last character in the normalized text.  This resets
 310  * the <tt>Normalizer's</tt> position to be just before the
 311  * the input text corresponding to that normalized character.
 312  */
 313 UChar32 Normalizer::last() {
 314     currentIndex=nextIndex=text->setToEnd();
 315     clearBuffer();
 316     return previous();
 317 }
 318
 319 /**
 320  * Retrieve the current iteration position in the input text that is
 321  * being normalized.  This method is useful in applications such as
 322  * searching, where you need to be able to determine the position in
 323  * the input text that corresponds to a given normalized output character.
 324  * <p>
 325  * <b>Note:</b> This method sets the position in the <em>input</em>, while
 326  * {@link #next} and {@link #previous} iterate through characters in the
 327  * <em>output</em>.  This means that there is not necessarily a one-to-one
 328  * correspondence between characters returned by <tt>next</tt> and
 329  * <tt>previous</tt> and the indices passed to and returned from
 330  * <tt>setIndex</tt> and {@link #getIndex}.
 331  *
 332  */
 333 int32_t Normalizer::getIndex() const {
 334     if(bufferPos<buffer.length()) {
 335         return currentIndex;
 336     } else {
 337         return nextIndex;
 338     }
 339 }
 340
 341 /**
 342  * Retrieve the index of the start of the input text.  This is the begin index
 343  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
 344  * over which this <tt>Normalizer</tt> is iterating
 345  */
 346 int32_t Normalizer::startIndex() const {
 347     return text->startIndex();
 348 }
 349
 350 /**
 351  * Retrieve the index of the end of the input text.  This is the end index
 352  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 353  * over which this <tt>Normalizer</tt> is iterating
 354  */
 355 int32_t Normalizer::endIndex() const {
 356     return text->endIndex();
 357 }
 358
 359 //-------------------------------------------------------------------------
 360 // Property access methods
 361 //-------------------------------------------------------------------------
 362
 363 void
 364 Normalizer::setMode(UNormalizationMode newMode)
 365 {
 366     fUMode = newMode;
 367     init();
 368 }
 369
 370 UNormalizationMode
 371 Normalizer::getUMode() const
 372 {
 373     return fUMode;
 374 }
 375
 376 void
 377 Normalizer::setOption(int32_t option,
 378                       UBool value)
 379 {
 380     if (value) {
 381         fOptions |= option;
 382     } else {
 383         fOptions &= (~option);
 384     }
 385     init();
 386 }
 387
 388 UBool
 389 Normalizer::getOption(int32_t option) const
 390 {
 391     return (fOptions & option) != 0;
 392 }
 393
 394 /**
 395  * Set the input text over which this <tt>Normalizer</tt> will iterate.
 396  * The iteration position is set to the beginning of the input text.
 397  */
 398 void
 399 Normalizer::setText(const UnicodeString& newText,
 400                     UErrorCode &status)
 401 {
 402     if (U_FAILURE(status)) {
 403         return;
 404     }
 405     CharacterIterator *newIter = new StringCharacterIterator(newText);
 406     if (newIter == NULL) {
 407         status = U_MEMORY_ALLOCATION_ERROR;
 408         return;
 409     }
 410     delete text;
 411     text = newIter;
 412     reset();
 413 }
 414
 415 /**
 416  * Set the input text over which this <tt>Normalizer</tt> will iterate.
 417  * The iteration position is set to the beginning of the string.
 418  */
 419 void
 420 Normalizer::setText(const CharacterIterator& newText,
 421                     UErrorCode &status)
 422 {
 423     if (U_FAILURE(status)) {
 424         return;
 425     }
 426     CharacterIterator *newIter = newText.clone();
 427     if (newIter == NULL) {
 428         status = U_MEMORY_ALLOCATION_ERROR;
 429         return;
 430     }
 431     delete text;
 432     text = newIter;
 433     reset();
 434 }
 435
 436 void
 437 Normalizer::setText(const UChar* newText,
 438                     int32_t length,
 439                     UErrorCode &status)
 440 {
 441     if (U_FAILURE(status)) {
 442         return;
 443     }
 444     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
 445     if (newIter == NULL) {
 446         status = U_MEMORY_ALLOCATION_ERROR;
 447         return;
 448     }
 449     delete text;
 450     text = newIter;
 451     reset();
 452 }
 453
 454 /**
 455  * Copies the text under iteration into the UnicodeString referred to by "result".
 456  * @param result Receives a copy of the text under iteration.
 457  */
 458 void
 459 Normalizer::getText(UnicodeString&  result)
 460 {
 461     text->getText(result);
 462 }
 463
 464 //-------------------------------------------------------------------------
 465 // Private utility methods
 466 //-------------------------------------------------------------------------
 467
 468 void Normalizer::clearBuffer() {
 469     buffer.remove();
 470     bufferPos=0;
 471 }
 472
 473 UBool
 474 Normalizer::nextNormalize() {
 475     clearBuffer();
 476     currentIndex=nextIndex;
 477     text->setIndex(nextIndex);
 478     if(!text->hasNext()) {
 479         return FALSE;
 480     }
 481     // Skip at least one character so we make progress.
 482     UnicodeString segment(text->next32PostInc());
 483     while(text->hasNext()) {
 484         UChar32 c;
 485         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
 486             text->move32(-1, CharacterIterator::kCurrent);
 487             break;
 488         }
 489         segment.append(c);
 490     }
 491     nextIndex=text->getIndex();
 492     UErrorCode errorCode=U_ZERO_ERROR;
 493     fNorm2->normalize(segment, buffer, errorCode);
 494     return U_SUCCESS(errorCode) && !buffer.isEmpty();
 495 }
 496
 497 UBool
 498 Normalizer::previousNormalize() {
 499     clearBuffer();
 500     nextIndex=currentIndex;
 501     text->setIndex(currentIndex);
 502     if(!text->hasPrevious()) {
 503         return FALSE;
 504     }
 505     UnicodeString segment;
 506     while(text->hasPrevious()) {
 507         UChar32 c=text->previous32();
 508         segment.insert(0, c);
 509         if(fNorm2->hasBoundaryBefore(c)) {
 510             break;
 511         }
 512     }
 513     currentIndex=text->getIndex();
 514     UErrorCode errorCode=U_ZERO_ERROR;
 515     fNorm2->normalize(segment, buffer, errorCode);
 516     bufferPos=buffer.length();
 517     return U_SUCCESS(errorCode) && !buffer.isEmpty();
 518 }
 519
 520 U_NAMESPACE_END
 521
 522 #endif /* #if !UCONFIG_NO_NORMALIZATION */