icuSources/i18n/nortrans.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2001-2011, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   Date        Name        Description
   9 *   07/03/01    aliu        Creation.
  10 **********************************************************************
  11 */
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_TRANSLITERATION
  16
  17 #include "unicode/normalizer2.h"
  18 #include "unicode/utf16.h"
  19 #include "cstring.h"
  20 #include "nortrans.h"
  21
  22 U_NAMESPACE_BEGIN
  23
  // Emits the ICU RTTI implementation boilerplate for NormalizationTransliterator.
  // NOTE(review): presumably this defines getStaticClassID()/getDynamicClassID();
  // confirm against the macro definition in unicode/uobject.h.
  24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
  25
  26 static inline Transliterator::Token cstrToken(const char *s) {
  27     return Transliterator::pointerToken((void *)s);
  28 }
  29
  30 /**
  31  * System registration hook.
  32  */
  33 void NormalizationTransliterator::registerIDs() {
  34     // In the Token, the byte after the NUL is the UNormalization2Mode.
  35     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
  36                                      _create, cstrToken("nfc\0\0"));
  37     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
  38                                      _create, cstrToken("nfkc\0\0"));
  39     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
  40                                      _create, cstrToken("nfc\0\1"));
  41     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
  42                                      _create, cstrToken("nfkc\0\1"));
  43     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
  44                                      _create, cstrToken("nfc\0\2"));
  45     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
  46                                      _create, cstrToken("nfc\0\3"));
  47     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
  48                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
  49     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
  50                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
  51     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
  52                                             UNICODE_STRING_SIMPLE("NFD"), FALSE);
  53     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
  54                                             UNICODE_STRING_SIMPLE("FCD"), FALSE);
  55 }
  56
  57 /**
  58  * Factory methods
  59  */
  60 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
  61                                                      Token context) {
  62     const char *name = (const char *)context.pointer;
  63     UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
  64     UErrorCode errorCode = U_ZERO_ERROR;
  65     const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
  66     if(U_SUCCESS(errorCode)) {
  67         return new NormalizationTransliterator(ID, *norm2);
  68     } else {
  69         return NULL;
  70     }
  71 }
  72
  73 /**
  74  * Constructs a transliterator.
  75  */
  76 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
  77                                                          const Normalizer2 &norm2) :
  78     Transliterator(id, 0), fNorm2(norm2) {}
  79
  80 /**
  81  * Destructor.
  82  */
  83 NormalizationTransliterator::~NormalizationTransliterator() {
  84 }
  85
  86 /**
  87  * Copy constructor.
  88  */
  89 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
  90     Transliterator(o), fNorm2(o.fNorm2) {}
  91
  92 /**
  93  * Transliterator API.
  94  */
  95 Transliterator* NormalizationTransliterator::clone(void) const {
  96     return new NormalizationTransliterator(*this);
  97 }
  98
  99 /**
 100  * Implements {@link Transliterator#handleTransliterate}.
 101  */
 102 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
 103                                                       UBool isIncremental) const {
 104     // start and limit of the input range
 105     int32_t start = offsets.start;
 106     int32_t limit = offsets.limit;
 107     if(start >= limit) {
 108         return;
 109     }
 110
 111     /*
 112      * Normalize as short chunks at a time as possible even in
 113      * bulk mode, so that styled text is minimally disrupted.
 114      * In incremental mode, a chunk that ends with offsets.limit
 115      * must not be normalized.
 116      *
 117      * If it was known that the input text is not styled, then
 118      * a bulk mode normalization could look like this:
 119
 120     UnicodeString input, normalized;
 121     int32_t length = limit - start;
 122     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
 123     input.releaseBuffer(length);
 124
 125     UErrorCode status = U_ZERO_ERROR;
 126     fNorm2.normalize(input, normalized, status);
 127
 128     text.handleReplaceBetween(start, limit, normalized);
 129
 130     int32_t delta = normalized.length() - length;
 131     offsets.contextLimit += delta;
 132     offsets.limit += delta;
 133     offsets.start = limit + delta;
 134
 135      */
 136     UErrorCode errorCode = U_ZERO_ERROR;
 137     UnicodeString segment;
 138     UnicodeString normalized;
 139     UChar32 c = text.char32At(start);
 140     do {
 141         int32_t prev = start;
 142         // Skip at least one character so we make progress.
 143         // c holds the character at start.
 144         segment.remove();
 145         do {
 146             segment.append(c);
 147             start += U16_LENGTH(c);
 148         } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
 149         if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
 150             // stop in incremental mode when we reach the input limit
 151             // in case there are additional characters that could change the
 152             // normalization result
 153             start=prev;
 154             break;
 155         }
 156         fNorm2.normalize(segment, normalized, errorCode);
 157         if(U_FAILURE(errorCode)) {
 158             break;
 159         }
 160         if(segment != normalized) {
 161             // replace the input chunk with its normalized form
 162             text.handleReplaceBetween(prev, start, normalized);
 163
 164             // update all necessary indexes accordingly
 165             int32_t delta = normalized.length() - (start - prev);
 166             start += delta;
 167             limit += delta;
 168         }
 169     } while(start < limit);
 170
 171     offsets.start = start;
 172     offsets.contextLimit += limit - offsets.limit;
 173     offsets.limit = limit;
 174 }
 175
 176 U_NAMESPACE_END
 177
 178 #endif /* #if !UCONFIG_NO_TRANSLITERATION */