icuSources/i18n/nortrans.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2003, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   07/03/01    aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/uniset.h"
  16 #include "unicode/uiter.h"
  17 #include "nortrans.h"
  18 #include "unormimp.h"
  19 #include "mutex.h"
  20 #include "ucln_in.h"
  21
  22 U_NAMESPACE_BEGIN
  23
  24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
  25
  26 /**
  27  * System registration hook.
  28  */
  29 void NormalizationTransliterator::registerIDs() {
  30     UErrorCode errorCode = U_ZERO_ERROR;
  31     if(!unorm_haveData(&errorCode)) {
  32         return;
  33     }
  34
  35     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
  36                                      _create, integerToken(UNORM_NFC));
  37     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
  38                                      _create, integerToken(UNORM_NFKC));
  39     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
  40                                      _create, integerToken(UNORM_NFD));
  41     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
  42                                      _create, integerToken(UNORM_NFKD));
  43     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
  44                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
  45     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
  46                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
  47 }
  48
  49 /**
  50  * Factory methods
  51  */
  52 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
  53                                                      Token context) {
  54     return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
  55 }
  56
  57 /**
  58  * Constructs a transliterator.
  59  */
  60 NormalizationTransliterator::NormalizationTransliterator(
  61                                  const UnicodeString& id,
  62                                  UNormalizationMode mode, int32_t opt) :
  63     Transliterator(id, 0) {
  64     fMode = mode;
  65     options = opt;
  66 }
  67
  68 /**
  69  * Destructor.
  70  */
  71 NormalizationTransliterator::~NormalizationTransliterator() {
  72 }
  73
  74 /**
  75  * Copy constructor.
  76  */
  77 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
  78 Transliterator(o) {
  79     fMode = o.fMode;
  80     options = o.options;
  81 }
  82
  83 /**
  84  * Assignment operator.
  85  */
  86 NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
  87     Transliterator::operator=(o);
  88     fMode = o.fMode;
  89     options = o.options;
  90     return *this;
  91 }
  92
  93 /**
  94  * Transliterator API.
  95  */
  96 Transliterator* NormalizationTransliterator::clone(void) const {
  97     return new NormalizationTransliterator(*this);
  98 }
  99
 100 /**
 101  * Implements {@link Transliterator#handleTransliterate}.
 102  */
 103 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
 104                                                       UBool isIncremental) const {
 105     // start and limit of the input range
 106     int32_t start = offsets.start;
 107     int32_t limit = offsets.limit;
 108     int32_t length, delta;
 109
 110     if(start >= limit) {
 111         return;
 112     }
 113
 114     // a C code unit iterator, implemented around the Replaceable
 115     UCharIterator iter;
 116     uiter_setReplaceable(&iter, &text);
 117
 118     // the output string and buffer pointer
 119     UnicodeString output;
 120     UChar *buffer;
 121     UBool neededToNormalize;
 122
 123     UErrorCode errorCode;
 124
 125     /*
 126      * Normalize as short chunks at a time as possible even in
 127      * bulk mode, so that styled text is minimally disrupted.
 128      * In incremental mode, a chunk that ends with offsets.limit
 129      * must not be normalized.
 130      *
 131      * If it was known that the input text is not styled, then
 132      * a bulk mode normalization could look like this:
 133      *
 134
 135     UChar staticChars[256];
 136     UnicodeString input;
 137
 138     length = limit - start;
 139     input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
 140
 141     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
 142     input.releaseBuffer(length);
 143
 144     UErrorCode status = U_ZERO_ERROR;
 145     Normalizer::normalize(input, fMode, options, output, status);
 146
 147     text.handleReplaceBetween(start, limit, output);
 148
 149     int32_t delta = output.length() - length;
 150     offsets.contextLimit += delta;
 151     offsets.limit += delta;
 152     offsets.start = limit + delta;
 153
 154      *
 155      */
 156     while(start < limit) {
 157         // set the iterator limits for the remaining input range
 158         // this is a moving target because of the replacements in the text object
 159         iter.start = iter.index = start;
 160         iter.limit = limit;
 161
 162         // incrementally normalize a small chunk of the input
 163         buffer = output.getBuffer(-1);
 164         errorCode = U_ZERO_ERROR;
 165         length = unorm_next(&iter, buffer, output.getCapacity(),
 166                             fMode, 0,
 167                             TRUE, &neededToNormalize,
 168                             &errorCode);
 169         output.releaseBuffer(length);
 170
 171         if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 172             // use a larger output string buffer and do it again from the start
 173             iter.index = start;
 174             buffer = output.getBuffer(length);
 175             errorCode = U_ZERO_ERROR;
 176             length = unorm_next(&iter, buffer, output.getCapacity(),
 177                                 fMode, 0,
 178                                 TRUE, &neededToNormalize,
 179                                 &errorCode);
 180             output.releaseBuffer(length);
 181         }
 182
 183         if(U_FAILURE(errorCode)) {
 184             break;
 185         }
 186
 187         limit = iter.index;
 188         if(isIncremental && limit == iter.limit) {
 189             // stop in incremental mode when we reach the input limit
 190             // in case there are additional characters that could change the
 191             // normalization result
 192
 193             // UNLESS all characters in the result of the normalization of
 194             // the last run are in the skippable set
 195             const UChar *s=output.getBuffer();
 196             int32_t i=0, outLength=output.length();
 197             UChar32 c;
 198
 199             while(i<outLength) {
 200                 U16_NEXT(s, i, outLength, c);
 201                 if(!unorm_isNFSkippable(c, fMode)) {
 202                     outLength=-1; // I wish C++ had labeled loops and break outer; ...
 203                     break;
 204                 }
 205             }
 206             if (outLength<0) {
 207                 break;
 208             }
 209         }
 210
 211         if(neededToNormalize) {
 212             // replace the input chunk with its normalized form
 213             text.handleReplaceBetween(start, limit, output);
 214
 215             // update all necessary indexes accordingly
 216             delta = length - (limit - start);   // length change in the text object
 217             start = limit += delta;             // the next chunk starts where this one ends, with adjustment
 218             limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
 219             offsets.contextLimit += delta;
 220         } else {
 221             // delta == 0
 222             start = limit;
 223             limit = offsets.limit;
 224         }
 225     }
 226
 227     offsets.start = start;
 228 }
 229
 230 U_NAMESPACE_END
 231
 232 #endif /* #if !UCONFIG_NO_TRANSLITERATION */