icuSources/common/normlzr.cpp

   1 /*
   2  *************************************************************************
   3  * COPYRIGHT:
   4  * Copyright (c) 1996-2005, International Business Machines Corporation and
   5  * others. All Rights Reserved.
   6  *************************************************************************
   7  */
   8
   9 #include "unicode/utypes.h"
  10
  11 #if !UCONFIG_NO_NORMALIZATION
  12
  13 #include "unicode/unistr.h"
  14 #include "unicode/chariter.h"
  15 #include "unicode/schriter.h"
  16 #include "unicode/uchriter.h"
  17 #include "unicode/uiter.h"
  18 #include "unicode/normlzr.h"
  19 #include "cmemory.h"
  20 #include "unormimp.h"
  21
  22 U_NAMESPACE_BEGIN
  23
  24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
  25
  26 //-------------------------------------------------------------------------
  27 // Constructors and other boilerplate
  28 //-------------------------------------------------------------------------
  29
  30 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
  31     UObject(), fUMode(mode), fOptions(0),
  32     currentIndex(0), nextIndex(0),
  33     buffer(), bufferPos(0)
  34 {
  35     init(new StringCharacterIterator(str));
  36 }
  37
  38 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
  39     UObject(), fUMode(mode), fOptions(0),
  40     currentIndex(0), nextIndex(0),
  41     buffer(), bufferPos(0)
  42 {
  43     init(new UCharCharacterIterator(str, length));
  44 }
  45
  46 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
  47     UObject(), fUMode(mode), fOptions(0),
  48     currentIndex(0), nextIndex(0),
  49     buffer(), bufferPos(0)
  50 {
  51     init(iter.clone());
  52 }
  53
  54 Normalizer::Normalizer(const Normalizer &copy) :
  55     UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions),
  56     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
  57     buffer(copy.buffer), bufferPos(copy.bufferPos)
  58 {
  59     init(((CharacterIterator *)(copy.text->context))->clone());
  60 }
  61
  62 static const UChar _NUL=0;
  63
  64 void
  65 Normalizer::init(CharacterIterator *iter) {
  66     UErrorCode errorCode=U_ZERO_ERROR;
  67
  68     text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator));
  69     if(text!=NULL) {
  70         if(unorm_haveData(&errorCode)) {
  71             uiter_setCharacterIterator(text, iter);
  72         } else {
  73             delete iter;
  74             uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
  75         }
  76     } else {
  77         delete iter;
  78     }
  79 }
  80
  81 Normalizer::~Normalizer()
  82 {
  83     if(text!=NULL) {
  84         delete (CharacterIterator *)text->context;
  85         uprv_free(text);
  86     }
  87 }
  88
  89 Normalizer*
  90 Normalizer::clone() const
  91 {
  92     if(this!=0) {
  93         return new Normalizer(*this);
  94     } else {
  95         return 0;
  96     }
  97 }
  98
  99 /**
 100  * Generates a hash code for this iterator.
 101  */
 102 int32_t Normalizer::hashCode() const
 103 {
 104     return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
 105 }
 106
 107 UBool Normalizer::operator==(const Normalizer& that) const
 108 {
 109     return
 110         this==&that ||
 111         fUMode==that.fUMode &&
 112         fOptions==that.fOptions &&
 113         *((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
 114         buffer==that.buffer &&
 115         bufferPos==that.bufferPos &&
 116         nextIndex==that.nextIndex;
 117 }
 118
 119 //-------------------------------------------------------------------------
 120 // Static utility methods
 121 //-------------------------------------------------------------------------
 122
 123 void U_EXPORT2
 124 Normalizer::normalize(const UnicodeString& source,
 125                       UNormalizationMode mode, int32_t options,
 126                       UnicodeString& result,
 127                       UErrorCode &status) {
 128     if(source.isBogus() || U_FAILURE(status)) {
 129         result.setToBogus();
 130         if(U_SUCCESS(status)) {
 131             status=U_ILLEGAL_ARGUMENT_ERROR;
 132         }
 133     } else {
 134         UnicodeString localDest;
 135         UnicodeString *dest;
 136
 137         if(&source!=&result) {
 138             dest=&result;
 139         } else {
 140             // the source and result strings are the same object, use a temporary one
 141             dest=&localDest;
 142         }
 143
 144         UChar *buffer=dest->getBuffer(source.length());
 145         int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
 146                                                source.getBuffer(), source.length(),
 147                                                mode, options,
 148                                                &status);
 149         dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
 150         if(status==U_BUFFER_OVERFLOW_ERROR) {
 151             status=U_ZERO_ERROR;
 152             buffer=dest->getBuffer(length);
 153             length=unorm_internalNormalize(buffer, dest->getCapacity(),
 154                                            source.getBuffer(), source.length(),
 155                                            mode, options,
 156                                            &status);
 157             dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
 158         }
 159
 160         if(dest==&localDest) {
 161             result=*dest;
 162         }
 163         if(U_FAILURE(status)) {
 164             result.setToBogus();
 165         }
 166     }
 167 }
 168
 169 void U_EXPORT2
 170 Normalizer::compose(const UnicodeString& source,
 171                     UBool compat, int32_t options,
 172                     UnicodeString& result,
 173                     UErrorCode &status) {
 174     if(source.isBogus() || U_FAILURE(status)) {
 175         result.setToBogus();
 176         if(U_SUCCESS(status)) {
 177             status=U_ILLEGAL_ARGUMENT_ERROR;
 178         }
 179     } else {
 180         UnicodeString localDest;
 181         UnicodeString *dest;
 182
 183         if(&source!=&result) {
 184             dest=&result;
 185         } else {
 186             // the source and result strings are the same object, use a temporary one
 187             dest=&localDest;
 188         }
 189
 190         UChar *buffer=dest->getBuffer(source.length());
 191         int32_t length=unorm_compose(buffer, dest->getCapacity(),
 192                                      source.getBuffer(), source.length(),
 193                                      compat, options,
 194                                      &status);
 195         dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
 196         if(status==U_BUFFER_OVERFLOW_ERROR) {
 197             status=U_ZERO_ERROR;
 198             buffer=dest->getBuffer(length);
 199             length=unorm_compose(buffer, dest->getCapacity(),
 200                                  source.getBuffer(), source.length(),
 201                                  compat, options,
 202                                  &status);
 203             dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
 204         }
 205
 206         if(dest==&localDest) {
 207             result=*dest;
 208         }
 209         if(U_FAILURE(status)) {
 210             result.setToBogus();
 211         }
 212     }
 213 }
 214
 215 void U_EXPORT2
 216 Normalizer::decompose(const UnicodeString& source,
 217                       UBool compat, int32_t options,
 218                       UnicodeString& result,
 219                       UErrorCode &status) {
 220     if(source.isBogus() || U_FAILURE(status)) {
 221         result.setToBogus();
 222         if(U_SUCCESS(status)) {
 223             status=U_ILLEGAL_ARGUMENT_ERROR;
 224         }
 225     } else {
 226         UnicodeString localDest;
 227         UnicodeString *dest;
 228
 229         if(&source!=&result) {
 230             dest=&result;
 231         } else {
 232             // the source and result strings are the same object, use a temporary one
 233             dest=&localDest;
 234         }
 235
 236         UChar *buffer=dest->getBuffer(source.length());
 237         int32_t length=unorm_decompose(buffer, dest->getCapacity(),
 238                                      source.getBuffer(), source.length(),
 239                                      compat, options,
 240                                      &status);
 241         dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
 242         if(status==U_BUFFER_OVERFLOW_ERROR) {
 243             status=U_ZERO_ERROR;
 244             buffer=dest->getBuffer(length);
 245             length=unorm_decompose(buffer, dest->getCapacity(),
 246                                    source.getBuffer(), source.length(),
 247                                    compat, options,
 248                                    &status);
 249             dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
 250         }
 251
 252         if(dest==&localDest) {
 253             result=*dest;
 254         }
 255         if(U_FAILURE(status)) {
 256             result.setToBogus();
 257         }
 258     }
 259 }
 260
 261 UnicodeString & U_EXPORT2
 262 Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
 263                         UnicodeString &result,
 264                         UNormalizationMode mode, int32_t options,
 265                         UErrorCode &errorCode) {
 266     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
 267         result.setToBogus();
 268         if(U_SUCCESS(errorCode)) {
 269             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 270         }
 271     } else {
 272         UnicodeString localDest;
 273         UnicodeString *dest;
 274
 275         if(&left!=&result && &right!=&result) {
 276             dest=&result;
 277         } else {
 278             // the source and result strings are the same object, use a temporary one
 279             dest=&localDest;
 280         }
 281
 282         UChar *buffer=dest->getBuffer(left.length()+right.length());
 283         int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
 284                                          right.getBuffer(), right.length(),
 285                                          buffer, dest->getCapacity(),
 286                                          mode, options,
 287                                          &errorCode);
 288         dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
 289         if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 290             errorCode=U_ZERO_ERROR;
 291             buffer=dest->getBuffer(length);
 292             int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
 293                                              right.getBuffer(), right.length(),
 294                                              buffer, dest->getCapacity(),
 295                                              mode, options,
 296                                              &errorCode);
 297             dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
 298         }
 299
 300         if(dest==&localDest) {
 301             result=*dest;
 302         }
 303         if(U_FAILURE(errorCode)) {
 304             result.setToBogus();
 305         }
 306     }
 307     return result;
 308 }
 309
 310 //-------------------------------------------------------------------------
 311 // Iteration API
 312 //-------------------------------------------------------------------------
 313
 314 /**
 315  * Return the current character in the normalized text.
 316  */
 317 UChar32 Normalizer::current() {
 318     if(bufferPos<buffer.length() || nextNormalize()) {
 319         return buffer.char32At(bufferPos);
 320     } else {
 321         return DONE;
 322     }
 323 }
 324
 325 /**
 326  * Return the next character in the normalized text and advance
 327  * the iteration position by one.  If the end
 328  * of the text has already been reached, {@link #DONE} is returned.
 329  */
 330 UChar32 Normalizer::next() {
 331     if(bufferPos<buffer.length() ||  nextNormalize()) {
 332         UChar32 c=buffer.char32At(bufferPos);
 333         bufferPos+=UTF_CHAR_LENGTH(c);
 334         return c;
 335     } else {
 336         return DONE;
 337     }
 338 }
 339
 340 /**
 341  * Return the previous character in the normalized text and decrement
 342  * the iteration position by one.  If the beginning
 343  * of the text has already been reached, {@link #DONE} is returned.
 344  */
 345 UChar32 Normalizer::previous() {
 346     if(bufferPos>0 || previousNormalize()) {
 347         UChar32 c=buffer.char32At(bufferPos-1);
 348         bufferPos-=UTF_CHAR_LENGTH(c);
 349         return c;
 350     } else {
 351         return DONE;
 352     }
 353 }
 354
 355 void Normalizer::reset() {
 356     currentIndex=nextIndex=text->move(text, 0, UITER_START);
 357     clearBuffer();
 358 }
 359
 360 void
 361 Normalizer::setIndexOnly(int32_t index) {
 362     currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
 363     clearBuffer();
 364 }
 365
 366 /**
 367  * Return the first character in the normalized text->  This resets
 368  * the <tt>Normalizer's</tt> position to the beginning of the text->
 369  */
 370 UChar32 Normalizer::first() {
 371     reset();
 372     return next();
 373 }
 374
 375 /**
 376  * Return the last character in the normalized text->  This resets
 377  * the <tt>Normalizer's</tt> position to be just before the
 378  * the input text corresponding to that normalized character.
 379  */
 380 UChar32 Normalizer::last() {
 381     currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
 382     clearBuffer();
 383     return previous();
 384 }
 385
 386 /**
 387  * Retrieve the current iteration position in the input text that is
 388  * being normalized.  This method is useful in applications such as
 389  * searching, where you need to be able to determine the position in
 390  * the input text that corresponds to a given normalized output character.
 391  * <p>
 392  * <b>Note:</b> This method sets the position in the <em>input</em>, while
 393  * {@link #next} and {@link #previous} iterate through characters in the
 394  * <em>output</em>.  This means that there is not necessarily a one-to-one
 395  * correspondence between characters returned by <tt>next</tt> and
 396  * <tt>previous</tt> and the indices passed to and returned from
 397  * <tt>setIndex</tt> and {@link #getIndex}.
 398  *
 399  */
 400 int32_t Normalizer::getIndex() const {
 401     if(bufferPos<buffer.length()) {
 402         return currentIndex;
 403     } else {
 404         return nextIndex;
 405     }
 406 }
 407
 408 /**
 409  * Retrieve the index of the start of the input text->  This is the begin index
 410  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
 411  * over which this <tt>Normalizer</tt> is iterating
 412  */
 413 int32_t Normalizer::startIndex() const {
 414     return text->getIndex(text, UITER_START);
 415 }
 416
 417 /**
 418  * Retrieve the index of the end of the input text->  This is the end index
 419  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 420  * over which this <tt>Normalizer</tt> is iterating
 421  */
 422 int32_t Normalizer::endIndex() const {
 423     return text->getIndex(text, UITER_LIMIT);
 424 }
 425
 426 //-------------------------------------------------------------------------
 427 // Property access methods
 428 //-------------------------------------------------------------------------
 429
 430 void
 431 Normalizer::setMode(UNormalizationMode newMode)
 432 {
 433     fUMode = newMode;
 434 }
 435
 436 UNormalizationMode
 437 Normalizer::getUMode() const
 438 {
 439     return fUMode;
 440 }
 441
 442 void
 443 Normalizer::setOption(int32_t option,
 444                       UBool value)
 445 {
 446     if (value) {
 447         fOptions |= option;
 448     } else {
 449         fOptions &= (~option);
 450     }
 451 }
 452
 453 UBool
 454 Normalizer::getOption(int32_t option) const
 455 {
 456     return (fOptions & option) != 0;
 457 }
 458
 459 /**
 460  * Set the input text over which this <tt>Normalizer</tt> will iterate.
 461  * The iteration position is set to the beginning of the input text->
 462  */
 463 void
 464 Normalizer::setText(const UnicodeString& newText,
 465                     UErrorCode &status)
 466 {
 467     if (U_FAILURE(status)) {
 468         return;
 469     }
 470     CharacterIterator *newIter = new StringCharacterIterator(newText);
 471     if (newIter == NULL) {
 472         status = U_MEMORY_ALLOCATION_ERROR;
 473         return;
 474     }
 475     delete (CharacterIterator *)(text->context);
 476     text->context = newIter;
 477     reset();
 478 }
 479
 480 /**
 481  * Set the input text over which this <tt>Normalizer</tt> will iterate.
 482  * The iteration position is set to the beginning of the string.
 483  */
 484 void
 485 Normalizer::setText(const CharacterIterator& newText,
 486                     UErrorCode &status)
 487 {
 488     if (U_FAILURE(status)) {
 489         return;
 490     }
 491     CharacterIterator *newIter = newText.clone();
 492     if (newIter == NULL) {
 493         status = U_MEMORY_ALLOCATION_ERROR;
 494         return;
 495     }
 496     delete (CharacterIterator *)(text->context);
 497     text->context = newIter;
 498     reset();
 499 }
 500
 501 void
 502 Normalizer::setText(const UChar* newText,
 503                     int32_t length,
 504                     UErrorCode &status)
 505 {
 506     if (U_FAILURE(status)) {
 507         return;
 508     }
 509     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
 510     if (newIter == NULL) {
 511         status = U_MEMORY_ALLOCATION_ERROR;
 512         return;
 513     }
 514     delete (CharacterIterator *)(text->context);
 515     text->context = newIter;
 516     reset();
 517 }
 518
 519 /**
 520  * Copies the text under iteration into the UnicodeString referred to by "result".
 521  * @param result Receives a copy of the text under iteration.
 522  */
 523 void
 524 Normalizer::getText(UnicodeString&  result)
 525 {
 526     ((CharacterIterator *)(text->context))->getText(result);
 527 }
 528
 529 //-------------------------------------------------------------------------
 530 // Private utility methods
 531 //-------------------------------------------------------------------------
 532
 533 void Normalizer::clearBuffer() {
 534     buffer.remove();
 535     bufferPos=0;
 536 }
 537
 538 UBool
 539 Normalizer::nextNormalize() {
 540     UChar *p;
 541     int32_t length;
 542     UErrorCode errorCode;
 543
 544     clearBuffer();
 545     currentIndex=nextIndex;
 546     text->move(text, nextIndex, UITER_ZERO);
 547     if(!text->hasNext(text)) {
 548         return FALSE;
 549     }
 550
 551     errorCode=U_ZERO_ERROR;
 552     p=buffer.getBuffer(-1);
 553     length=unorm_next(text, p, buffer.getCapacity(),
 554                       fUMode, fOptions,
 555                       TRUE, 0,
 556                       &errorCode);
 557     buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
 558     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 559         errorCode=U_ZERO_ERROR;
 560         text->move(text, nextIndex, UITER_ZERO);
 561         p=buffer.getBuffer(length);
 562         length=unorm_next(text, p, buffer.getCapacity(),
 563                           fUMode, fOptions,
 564                           TRUE, 0,
 565                           &errorCode);
 566         buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
 567     }
 568
 569     nextIndex=text->getIndex(text, UITER_CURRENT);
 570     return U_SUCCESS(errorCode) && !buffer.isEmpty();
 571 }
 572
 573 UBool
 574 Normalizer::previousNormalize() {
 575     UChar *p;
 576     int32_t length;
 577     UErrorCode errorCode;
 578
 579     clearBuffer();
 580     nextIndex=currentIndex;
 581     text->move(text, currentIndex, UITER_ZERO);
 582     if(!text->hasPrevious(text)) {
 583         return FALSE;
 584     }
 585
 586     errorCode=U_ZERO_ERROR;
 587     p=buffer.getBuffer(-1);
 588     length=unorm_previous(text, p, buffer.getCapacity(),
 589                           fUMode, fOptions,
 590                           TRUE, 0,
 591                           &errorCode);
 592     buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
 593     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 594         errorCode=U_ZERO_ERROR;
 595         text->move(text, currentIndex, UITER_ZERO);
 596         p=buffer.getBuffer(length);
 597         length=unorm_previous(text, p, buffer.getCapacity(),
 598                               fUMode, fOptions,
 599                               TRUE, 0,
 600                               &errorCode);
 601         buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
 602     }
 603
 604     bufferPos=buffer.length();
 605     currentIndex=text->getIndex(text, UITER_CURRENT);
 606     return U_SUCCESS(errorCode) && !buffer.isEmpty();
 607 }
 608
 609 U_NAMESPACE_END
 610
 611 #endif /* #if !UCONFIG_NO_NORMALIZATION */