icuSources/common/normlzr.cpp

   1 /*
   2  *************************************************************************
   3  * COPYRIGHT:
   4  * Copyright (c) 1996-2012, International Business Machines Corporation and
   5  * others. All Rights Reserved.
   6  *************************************************************************
   7  */
   8
   9 #include "unicode/utypes.h"
  10
  11 #if !UCONFIG_NO_NORMALIZATION
  12
  13 #include "unicode/uniset.h"
  14 #include "unicode/unistr.h"
  15 #include "unicode/chariter.h"
  16 #include "unicode/schriter.h"
  17 #include "unicode/uchriter.h"
  18 #include "unicode/normlzr.h"
  19 #include "unicode/utf16.h"
  20 #include "cmemory.h"
  21 #include "normalizer2impl.h"
  22 #include "uprops.h"  // for uniset_getUnicode32Instance()
  23
  24 U_NAMESPACE_BEGIN
  25
  26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
  27
  28 //-------------------------------------------------------------------------
  29 // Constructors and other boilerplate
  30 //-------------------------------------------------------------------------
  31
  32 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
  33     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  34     text(new StringCharacterIterator(str)),
  35     currentIndex(0), nextIndex(0),
  36     buffer(), bufferPos(0)
  37 {
  38     init();
  39 }
  40
  41 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
  42     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  43     text(new UCharCharacterIterator(str, length)),
  44     currentIndex(0), nextIndex(0),
  45     buffer(), bufferPos(0)
  46 {
  47     init();
  48 }
  49
  50 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
  51     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  52     text(iter.clone()),
  53     currentIndex(0), nextIndex(0),
  54     buffer(), bufferPos(0)
  55 {
  56     init();
  57 }
  58
  59 Normalizer::Normalizer(const Normalizer &copy) :
  60     UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
  61     text(copy.text->clone()),
  62     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
  63     buffer(copy.buffer), bufferPos(copy.bufferPos)
  64 {
  65     init();
  66 }
  67
  68 void
  69 Normalizer::init() {
  70     UErrorCode errorCode=U_ZERO_ERROR;
  71     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
  72     if(fOptions&UNORM_UNICODE_3_2) {
  73         delete fFilteredNorm2;
  74         fNorm2=fFilteredNorm2=
  75             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
  76     }
  77     if(U_FAILURE(errorCode)) {
  78         errorCode=U_ZERO_ERROR;
  79         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
  80     }
  81 }
  82
  83 Normalizer::~Normalizer()
  84 {
  85     delete fFilteredNorm2;
  86     delete text;
  87 }
  88
  89 Normalizer*
  90 Normalizer::clone() const
  91 {
  92     return new Normalizer(*this);
  93 }
  94
  95 /**
  96  * Generates a hash code for this iterator.
  97  */
  98 int32_t Normalizer::hashCode() const
  99 {
 100     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
 101 }
 102
 103 UBool Normalizer::operator==(const Normalizer& that) const
 104 {
 105     return
 106         this==&that ||
 107         (fUMode==that.fUMode &&
 108         fOptions==that.fOptions &&
 109         *text==*that.text &&
 110         buffer==that.buffer &&
 111         bufferPos==that.bufferPos &&
 112         nextIndex==that.nextIndex);
 113 }
 114
 115 //-------------------------------------------------------------------------
 116 // Static utility methods
 117 //-------------------------------------------------------------------------
 118
 119 void U_EXPORT2
 120 Normalizer::normalize(const UnicodeString& source,
 121                       UNormalizationMode mode, int32_t options,
 122                       UnicodeString& result,
 123                       UErrorCode &status) {
 124     if(source.isBogus() || U_FAILURE(status)) {
 125         result.setToBogus();
 126         if(U_SUCCESS(status)) {
 127             status=U_ILLEGAL_ARGUMENT_ERROR;
 128         }
 129     } else {
 130         UnicodeString localDest;
 131         UnicodeString *dest;
 132
 133         if(&source!=&result) {
 134             dest=&result;
 135         } else {
 136             // the source and result strings are the same object, use a temporary one
 137             dest=&localDest;
 138         }
 139         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 140         if(U_SUCCESS(status)) {
 141             if(options&UNORM_UNICODE_3_2) {
 142                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 143                     normalize(source, *dest, status);
 144             } else {
 145                 n2->normalize(source, *dest, status);
 146             }
 147         }
 148         if(dest==&localDest && U_SUCCESS(status)) {
 149             result=*dest;
 150         }
 151     }
 152 }
 153
 154 void U_EXPORT2
 155 Normalizer::compose(const UnicodeString& source,
 156                     UBool compat, int32_t options,
 157                     UnicodeString& result,
 158                     UErrorCode &status) {
 159     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
 160 }
 161
 162 void U_EXPORT2
 163 Normalizer::decompose(const UnicodeString& source,
 164                       UBool compat, int32_t options,
 165                       UnicodeString& result,
 166                       UErrorCode &status) {
 167     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
 168 }
 169
 170 UNormalizationCheckResult
 171 Normalizer::quickCheck(const UnicodeString& source,
 172                        UNormalizationMode mode, int32_t options,
 173                        UErrorCode &status) {
 174     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 175     if(U_SUCCESS(status)) {
 176         if(options&UNORM_UNICODE_3_2) {
 177             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 178                 quickCheck(source, status);
 179         } else {
 180             return n2->quickCheck(source, status);
 181         }
 182     } else {
 183         return UNORM_MAYBE;
 184     }
 185 }
 186
 187 UBool
 188 Normalizer::isNormalized(const UnicodeString& source,
 189                          UNormalizationMode mode, int32_t options,
 190                          UErrorCode &status) {
 191     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 192     if(U_SUCCESS(status)) {
 193         if(options&UNORM_UNICODE_3_2) {
 194             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 195                 isNormalized(source, status);
 196         } else {
 197             return n2->isNormalized(source, status);
 198         }
 199     } else {
 200         return FALSE;
 201     }
 202 }
 203
 204 UnicodeString & U_EXPORT2
 205 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
 206                         UnicodeString &result,
 207                         UNormalizationMode mode, int32_t options,
 208                         UErrorCode &errorCode) {
 209     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
 210         result.setToBogus();
 211         if(U_SUCCESS(errorCode)) {
 212             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 213         }
 214     } else {
 215         UnicodeString localDest;
 216         UnicodeString *dest;
 217
 218         if(&right!=&result) {
 219             dest=&result;
 220         } else {
 221             // the right and result strings are the same object, use a temporary one
 222             dest=&localDest;
 223         }
 224         *dest=left;
 225         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
 226         if(U_SUCCESS(errorCode)) {
 227             if(options&UNORM_UNICODE_3_2) {
 228                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
 229                     append(*dest, right, errorCode);
 230             } else {
 231                 n2->append(*dest, right, errorCode);
 232             }
 233         }
 234         if(dest==&localDest && U_SUCCESS(errorCode)) {
 235             result=*dest;
 236         }
 237     }
 238     return result;
 239 }
 240
 241 //-------------------------------------------------------------------------
 242 // Iteration API
 243 //-------------------------------------------------------------------------
 244
 245 /**
 246  * Return the current character in the normalized text.
 247  */
 248 UChar32 Normalizer::current() {
 249     if(bufferPos<buffer.length() || nextNormalize()) {
 250         return buffer.char32At(bufferPos);
 251     } else {
 252         return DONE;
 253     }
 254 }
 255
 256 /**
 257  * Return the next character in the normalized text and advance
 258  * the iteration position by one.  If the end
 259  * of the text has already been reached, {@link #DONE} is returned.
 260  */
 261 UChar32 Normalizer::next() {
 262     if(bufferPos<buffer.length() ||  nextNormalize()) {
 263         UChar32 c=buffer.char32At(bufferPos);
 264         bufferPos+=U16_LENGTH(c);
 265         return c;
 266     } else {
 267         return DONE;
 268     }
 269 }
 270
 271 /**
 272  * Return the previous character in the normalized text and decrement
 273  * the iteration position by one.  If the beginning
 274  * of the text has already been reached, {@link #DONE} is returned.
 275  */
 276 UChar32 Normalizer::previous() {
 277     if(bufferPos>0 || previousNormalize()) {
 278         UChar32 c=buffer.char32At(bufferPos-1);
 279         bufferPos-=U16_LENGTH(c);
 280         return c;
 281     } else {
 282         return DONE;
 283     }
 284 }
 285
 286 void Normalizer::reset() {
 287     currentIndex=nextIndex=text->setToStart();
 288     clearBuffer();
 289 }
 290
 291 void
 292 Normalizer::setIndexOnly(int32_t index) {
 293     text->setIndex(index);  // pins index
 294     currentIndex=nextIndex=text->getIndex();
 295     clearBuffer();
 296 }
 297
 298 /**
 299  * Return the first character in the normalized text.  This resets
 300  * the <tt>Normalizer's</tt> position to the beginning of the text.
 301  */
 302 UChar32 Normalizer::first() {
 303     reset();
 304     return next();
 305 }
 306
 307 /**
 308  * Return the last character in the normalized text.  This resets
 309  * the <tt>Normalizer's</tt> position to be just before the
 310  * the input text corresponding to that normalized character.
 311  */
 312 UChar32 Normalizer::last() {
 313     currentIndex=nextIndex=text->setToEnd();
 314     clearBuffer();
 315     return previous();
 316 }
 317
 318 /**
 319  * Retrieve the current iteration position in the input text that is
 320  * being normalized.  This method is useful in applications such as
 321  * searching, where you need to be able to determine the position in
 322  * the input text that corresponds to a given normalized output character.
 323  * <p>
 324  * <b>Note:</b> This method sets the position in the <em>input</em>, while
 325  * {@link #next} and {@link #previous} iterate through characters in the
 326  * <em>output</em>.  This means that there is not necessarily a one-to-one
 327  * correspondence between characters returned by <tt>next</tt> and
 328  * <tt>previous</tt> and the indices passed to and returned from
 329  * <tt>setIndex</tt> and {@link #getIndex}.
 330  *
 331  */
 332 int32_t Normalizer::getIndex() const {
 333     if(bufferPos<buffer.length()) {
 334         return currentIndex;
 335     } else {
 336         return nextIndex;
 337     }
 338 }
 339
 340 /**
 341  * Retrieve the index of the start of the input text.  This is the begin index
 342  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
 343  * over which this <tt>Normalizer</tt> is iterating
 344  */
 345 int32_t Normalizer::startIndex() const {
 346     return text->startIndex();
 347 }
 348
 349 /**
 350  * Retrieve the index of the end of the input text.  This is the end index
 351  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 352  * over which this <tt>Normalizer</tt> is iterating
 353  */
 354 int32_t Normalizer::endIndex() const {
 355     return text->endIndex();
 356 }
 357
 358 //-------------------------------------------------------------------------
 359 // Property access methods
 360 //-------------------------------------------------------------------------
 361
 362 void
 363 Normalizer::setMode(UNormalizationMode newMode)
 364 {
 365     fUMode = newMode;
 366     init();
 367 }
 368
 369 UNormalizationMode
 370 Normalizer::getUMode() const
 371 {
 372     return fUMode;
 373 }
 374
 375 void
 376 Normalizer::setOption(int32_t option,
 377                       UBool value)
 378 {
 379     if (value) {
 380         fOptions |= option;
 381     } else {
 382         fOptions &= (~option);
 383     }
 384     init();
 385 }
 386
 387 UBool
 388 Normalizer::getOption(int32_t option) const
 389 {
 390     return (fOptions & option) != 0;
 391 }
 392
 393 /**
 394  * Set the input text over which this <tt>Normalizer</tt> will iterate.
 395  * The iteration position is set to the beginning of the input text.
 396  */
 397 void
 398 Normalizer::setText(const UnicodeString& newText,
 399                     UErrorCode &status)
 400 {
 401     if (U_FAILURE(status)) {
 402         return;
 403     }
 404     CharacterIterator *newIter = new StringCharacterIterator(newText);
 405     if (newIter == NULL) {
 406         status = U_MEMORY_ALLOCATION_ERROR;
 407         return;
 408     }
 409     delete text;
 410     text = newIter;
 411     reset();
 412 }
 413
 414 /**
 415  * Set the input text over which this <tt>Normalizer</tt> will iterate.
 416  * The iteration position is set to the beginning of the string.
 417  */
 418 void
 419 Normalizer::setText(const CharacterIterator& newText,
 420                     UErrorCode &status)
 421 {
 422     if (U_FAILURE(status)) {
 423         return;
 424     }
 425     CharacterIterator *newIter = newText.clone();
 426     if (newIter == NULL) {
 427         status = U_MEMORY_ALLOCATION_ERROR;
 428         return;
 429     }
 430     delete text;
 431     text = newIter;
 432     reset();
 433 }
 434
 435 void
 436 Normalizer::setText(const UChar* newText,
 437                     int32_t length,
 438                     UErrorCode &status)
 439 {
 440     if (U_FAILURE(status)) {
 441         return;
 442     }
 443     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
 444     if (newIter == NULL) {
 445         status = U_MEMORY_ALLOCATION_ERROR;
 446         return;
 447     }
 448     delete text;
 449     text = newIter;
 450     reset();
 451 }
 452
 453 /**
 454  * Copies the text under iteration into the UnicodeString referred to by "result".
 455  * @param result Receives a copy of the text under iteration.
 456  */
 457 void
 458 Normalizer::getText(UnicodeString&  result)
 459 {
 460     text->getText(result);
 461 }
 462
 463 //-------------------------------------------------------------------------
 464 // Private utility methods
 465 //-------------------------------------------------------------------------
 466
 467 void Normalizer::clearBuffer() {
 468     buffer.remove();
 469     bufferPos=0;
 470 }
 471
 472 UBool
 473 Normalizer::nextNormalize() {
 474     clearBuffer();
 475     currentIndex=nextIndex;
 476     text->setIndex(nextIndex);
 477     if(!text->hasNext()) {
 478         return FALSE;
 479     }
 480     // Skip at least one character so we make progress.
 481     UnicodeString segment(text->next32PostInc());
 482     while(text->hasNext()) {
 483         UChar32 c;
 484         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
 485             text->move32(-1, CharacterIterator::kCurrent);
 486             break;
 487         }
 488         segment.append(c);
 489     }
 490     nextIndex=text->getIndex();
 491     UErrorCode errorCode=U_ZERO_ERROR;
 492     fNorm2->normalize(segment, buffer, errorCode);
 493     return U_SUCCESS(errorCode) && !buffer.isEmpty();
 494 }
 495
 496 UBool
 497 Normalizer::previousNormalize() {
 498     clearBuffer();
 499     nextIndex=currentIndex;
 500     text->setIndex(currentIndex);
 501     if(!text->hasPrevious()) {
 502         return FALSE;
 503     }
 504     UnicodeString segment;
 505     while(text->hasPrevious()) {
 506         UChar32 c=text->previous32();
 507         segment.insert(0, c);
 508         if(fNorm2->hasBoundaryBefore(c)) {
 509             break;
 510         }
 511     }
 512     currentIndex=text->getIndex();
 513     UErrorCode errorCode=U_ZERO_ERROR;
 514     fNorm2->normalize(segment, buffer, errorCode);
 515     bufferPos=buffer.length();
 516     return U_SUCCESS(errorCode) && !buffer.isEmpty();
 517 }
 518
 519 U_NAMESPACE_END
 520
 521 #endif /* #if !UCONFIG_NO_NORMALIZATION */