icuSources/i18n/nortrans.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2001-2011, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   07/03/01    aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/normalizer2.h"
  16 #include "unicode/utf16.h"
  17 #include "cstring.h"
  18 #include "nortrans.h"
  19
  20 U_NAMESPACE_BEGIN
  21
  22 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
  23
  24 static inline Transliterator::Token cstrToken(const char *s) {
  25     return Transliterator::pointerToken((void *)s);
  26 }
  27
  28 /**
  29  * System registration hook.
  30  */
  31 void NormalizationTransliterator::registerIDs() {
  32     // In the Token, the byte after the NUL is the UNormalization2Mode.
  33     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
  34                                      _create, cstrToken("nfc\0\0"));
  35     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
  36                                      _create, cstrToken("nfkc\0\0"));
  37     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
  38                                      _create, cstrToken("nfc\0\1"));
  39     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
  40                                      _create, cstrToken("nfkc\0\1"));
  41     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
  42                                      _create, cstrToken("nfc\0\2"));
  43     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
  44                                      _create, cstrToken("nfc\0\3"));
  45     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
  46                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
  47     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
  48                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
  49     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
  50                                             UNICODE_STRING_SIMPLE("NFD"), FALSE);
  51     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
  52                                             UNICODE_STRING_SIMPLE("FCD"), FALSE);
  53 }
  54
  55 /**
  56  * Factory methods
  57  */
  58 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
  59                                                      Token context) {
  60     const char *name = (const char *)context.pointer;
  61     UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
  62     UErrorCode errorCode = U_ZERO_ERROR;
  63     const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
  64     if(U_SUCCESS(errorCode)) {
  65         return new NormalizationTransliterator(ID, *norm2);
  66     } else {
  67         return NULL;
  68     }
  69 }
  70
  71 /**
  72  * Constructs a transliterator.
  73  */
  74 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
  75                                                          const Normalizer2 &norm2) :
  76     Transliterator(id, 0), fNorm2(norm2) {}
  77
  78 /**
  79  * Destructor.
  80  */
  81 NormalizationTransliterator::~NormalizationTransliterator() {
  82 }
  83
  84 /**
  85  * Copy constructor.
  86  */
  87 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
  88     Transliterator(o), fNorm2(o.fNorm2) {}
  89
  90 /**
  91  * Transliterator API.
  92  */
  93 Transliterator* NormalizationTransliterator::clone(void) const {
  94     return new NormalizationTransliterator(*this);
  95 }
  96
  97 /**
  98  * Implements {@link Transliterator#handleTransliterate}.
  99  */
 100 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
 101                                                       UBool isIncremental) const {
 102     // start and limit of the input range
 103     int32_t start = offsets.start;
 104     int32_t limit = offsets.limit;
 105     if(start >= limit) {
 106         return;
 107     }
 108
 109     /*
 110      * Normalize as short chunks at a time as possible even in
 111      * bulk mode, so that styled text is minimally disrupted.
 112      * In incremental mode, a chunk that ends with offsets.limit
 113      * must not be normalized.
 114      *
 115      * If it was known that the input text is not styled, then
 116      * a bulk mode normalization could look like this:
 117
 118     UnicodeString input, normalized;
 119     int32_t length = limit - start;
 120     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
 121     input.releaseBuffer(length);
 122
 123     UErrorCode status = U_ZERO_ERROR;
 124     fNorm2.normalize(input, normalized, status);
 125
 126     text.handleReplaceBetween(start, limit, normalized);
 127
 128     int32_t delta = normalized.length() - length;
 129     offsets.contextLimit += delta;
 130     offsets.limit += delta;
 131     offsets.start = limit + delta;
 132
 133      */
 134     UErrorCode errorCode = U_ZERO_ERROR;
 135     UnicodeString segment;
 136     UnicodeString normalized;
 137     UChar32 c = text.char32At(start);
 138     do {
 139         int32_t prev = start;
 140         // Skip at least one character so we make progress.
 141         // c holds the character at start.
 142         segment.remove();
 143         do {
 144             segment.append(c);
 145             start += U16_LENGTH(c);
 146         } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
 147         if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
 148             // stop in incremental mode when we reach the input limit
 149             // in case there are additional characters that could change the
 150             // normalization result
 151             start=prev;
 152             break;
 153         }
 154         fNorm2.normalize(segment, normalized, errorCode);
 155         if(U_FAILURE(errorCode)) {
 156             break;
 157         }
 158         if(segment != normalized) {
 159             // replace the input chunk with its normalized form
 160             text.handleReplaceBetween(prev, start, normalized);
 161
 162             // update all necessary indexes accordingly
 163             int32_t delta = normalized.length() - (start - prev);
 164             start += delta;
 165             limit += delta;
 166         }
 167     } while(start < limit);
 168
 169     offsets.start = start;
 170     offsets.contextLimit += limit - offsets.limit;
 171     offsets.limit = limit;
 172 }
 173
 174 U_NAMESPACE_END
 175
 176 #endif /* #if !UCONFIG_NO_TRANSLITERATION */