2 **********************************************************************
3 * Copyright (C) 2001-2003, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 07/03/01 aliu Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_TRANSLITERATION
15 #include "unicode/uniset.h"
16 #include "unicode/uiter.h"
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for
// NormalizationTransliterator.
// NOTE(review): the macro body is not visible in this file - presumably it
// supplies the class-ID / RTTI members for the class; confirm in the ICU headers.
24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator
)
27 * System registration hook.
// Registers the normalization transliterators with the Transliterator registry.
//
// Four factory entries are added - "Any-NFC", "Any-NFKC", "Any-NFD" and
// "Any-NFKD". Each entry uses _create as its factory function and carries the
// matching UNORM_* mode wrapped by integerToken(). Afterwards NFC/NFD and
// NFKC/NFKD are registered as special inverse pairs.
//
// NOTE(review): this listing appears to be missing some original lines
// (for example the body of the data check below and the closing braces).
29 void NormalizationTransliterator::registerIDs() {
30 UErrorCode errorCode
= U_ZERO_ERROR
;
// Guard on the availability of normalization data.
// NOTE(review): the body of this if is not visible here - presumably it
// returns early so nothing is registered without data; confirm.
31 if(!unorm_haveData(&errorCode
)) {
// One factory registration per normalization form; the mode travels with the
// registration as an integer token and is read back in _create.
35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
36 _create
, integerToken(UNORM_NFC
));
37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
38 _create
, integerToken(UNORM_NFKC
));
39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
40 _create
, integerToken(UNORM_NFD
));
41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
42 _create
, integerToken(UNORM_NFKD
));
// Pair each composing form with its decomposing counterpart.
// NOTE(review): the meaning of the trailing TRUE argument is not visible
// here - presumably it makes the inverse relation bidirectional; confirm.
43 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
44 UNICODE_STRING_SIMPLE("NFD"), TRUE
);
45 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
46 UNICODE_STRING_SIMPLE("NFKD"), TRUE
);
// Factory function passed to Transliterator::_registerFactory by registerIDs().
//
// Allocates a new NormalizationTransliterator for the given ID. The
// normalization mode is taken from the integer member of `context`, cast to
// UNormalizationMode; the third constructor argument (opt) is always 0.
// Returns the newly allocated object as a Transliterator*.
//
// NOTE(review): the declaration of the `context` parameter is not visible in
// this listing; confirm its type against the header.
52 Transliterator
* NormalizationTransliterator::_create(const UnicodeString
& ID
,
54 return new NormalizationTransliterator(ID
, (UNormalizationMode
) context
.integer
, 0);
58 * Constructs a transliterator.
// Constructor taking the transliterator id, the normalization mode and an
// option value (opt).
//
// The base class is initialised with Transliterator(id, 0).
// NOTE(review): the second base-class argument (0) is presumably a null
// filter, and the constructor body - presumably storing mode and opt in
// members - is not visible in this listing; confirm both.
60 NormalizationTransliterator::NormalizationTransliterator(
61 const UnicodeString
& id
,
62 UNormalizationMode mode
, int32_t opt
) :
63 Transliterator(id
, 0) {
// Destructor.
// NOTE(review): the body is not visible in this listing.
71 NormalizationTransliterator::~NormalizationTransliterator() {
// Copy constructor: builds a new object from `o`. clone() relies on it.
// NOTE(review): the member-initialiser list and body are not visible in this
// listing, so which members are copied cannot be confirmed from here.
77 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator
& o
) :
84 * Assignment operator.
// Copy-assignment operator; returns a NormalizationTransliterator reference.
// NOTE(review): only the base-class step is visible in this listing; the
// copying of this class's own members and the return statement are not shown.
86 NormalizationTransliterator
& NormalizationTransliterator::operator=(const NormalizationTransliterator
& o
) {
// Let the base class copy its own state first.
87 Transliterator::operator=(o
);
// Polymorphic copy: returns a newly allocated NormalizationTransliterator
// created from *this through the copy constructor, typed as Transliterator*.
96 Transliterator
* NormalizationTransliterator::clone(void) const {
97 return new NormalizationTransliterator(*this);
101 * Implements {@link Transliterator#handleTransliterate}.
// Normalizes the text between offsets.start and offsets.limit in place.
//
// The range is processed one small chunk at a time with unorm_next(), reading
// the text through a UCharIterator set up over the Replaceable. A chunk is
// written back with text.handleReplaceBetween() only when unorm_next() reports
// neededToNormalize; the resulting length change (delta) is then applied to
// start, limit, offsets.limit and offsets.contextLimit.
//
// In incremental mode (isIncremental), a chunk for which limit == iter.limit
// is examined specially: its normalized output is scanned and outLength is set
// to -1 if any code point is not NF-skippable for fMode. Per the in-code
// comments this is how processing stops at the input limit when later input
// could still change the result.
//
// On exit offsets.start is set to the position reached (start).
//
// Parameters:
//   text          - the text being modified.
//   offsets       - start/limit/contextLimit of the range; updated in place.
//   isIncremental - enables the hold-back logic described above.
//
// NOTE(review): this listing appears to be missing a number of original lines
// (some declarations such as iter/buffer/c, some unorm_next() arguments, the
// bodies of a few if statements, and closing braces). Comments below describe
// only what is visible.
103 void NormalizationTransliterator::handleTransliterate(Replaceable
& text
, UTransPosition
& offsets
,
104 UBool isIncremental
) const {
105 // start and limit of the input range
106 int32_t start
= offsets
.start
;
107 int32_t limit
= offsets
.limit
;
108 int32_t length
, delta
;
114 // a C code unit iterator, implemented around the Replaceable
116 uiter_setReplaceable(&iter
, &text
);
118 // the output string and buffer pointer
119 UnicodeString output
;
121 UBool neededToNormalize
;
123 UErrorCode errorCode
;
// NOTE(review): the lines from here down to the sample statement
// "offsets.start = limit + delta;" read as a design note followed by an
// illustrative bulk-mode snippet ("could look like this"). The comment
// delimiters are not visible in this listing - confirm that this text is
// commented out in the real file and is not live code.
126 * Normalize as short chunks at a time as possible even in
127 * bulk mode, so that styled text is minimally disrupted.
128 * In incremental mode, a chunk that ends with offsets.limit
129 * must not be normalized.
131 * If it was known that the input text is not styled, then
132 * a bulk mode normalization could look like this:
135 UChar staticChars[256];
138 length = limit - start;
139 input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
141 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
142 input.releaseBuffer(length);
144 UErrorCode status = U_ZERO_ERROR;
145 Normalizer::normalize(input, fMode, options, output, status);
147 text.handleReplaceBetween(start, limit, output);
149 int32_t delta = output.length() - length;
150 offsets.contextLimit += delta;
151 offsets.limit += delta;
152 offsets.start = limit + delta;
// Main loop: one normalization chunk per iteration until start reaches limit.
156 while(start
< limit
) {
157 // set the iterator limits for the remaining input range
158 // this is a moving target because of the replacements in the text object
159 iter
.start
= iter
.index
= start
;
162 // incrementally normalize a small chunk of the input
// First attempt writes into whatever capacity output already has;
// the error code is reset before each unorm_next() call.
163 buffer
= output
.getBuffer(-1);
164 errorCode
= U_ZERO_ERROR
;
165 length
= unorm_next(&iter
, buffer
, output
.getCapacity(),
167 TRUE
, &neededToNormalize
,
169 output
.releaseBuffer(length
);
// On overflow, length is used as the capacity request for the retry.
171 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
172 // use a larger output string buffer and do it again from the start
174 buffer
= output
.getBuffer(length
);
175 errorCode
= U_ZERO_ERROR
;
176 length
= unorm_next(&iter
, buffer
, output
.getCapacity(),
178 TRUE
, &neededToNormalize
,
180 output
.releaseBuffer(length
);
// NOTE(review): the failure-handling body of the next if is not visible here.
183 if(U_FAILURE(errorCode
)) {
188 if(isIncremental
&& limit
== iter
.limit
) {
189 // stop in incremental mode when we reach the input limit
190 // in case there are additional characters that could change the
191 // normalization result
193 // UNLESS all characters in the result of the normalization of
194 // the last run are in the skippable set
195 const UChar
*s
=output
.getBuffer();
196 int32_t i
=0, outLength
=output
.length();
// Read the next code point of the normalized output into c, advancing i.
// outLength doubles as a flag: -1 marks "found a non-skippable code point".
// NOTE(review): the enclosing loop and the code that acts on the -1 flag
// are not visible in this listing.
200 U16_NEXT(s
, i
, outLength
, c
);
201 if(!unorm_isNFSkippable(c
, fMode
)) {
202 outLength
=-1; // I wish C++ had labeled loops and break outer; ...
211 if(neededToNormalize
) {
212 // replace the input chunk with its normalized form
213 text
.handleReplaceBetween(start
, limit
, output
);
215 // update all necessary indexes accordingly
216 delta
= length
- (limit
- start
); // length change in the text object
217 start
= limit
+= delta
; // the next chunk starts where this one ends, with adjustment
218 limit
= offsets
.limit
+= delta
; // set the iteration limit to the adjusted end of the input range
219 offsets
.contextLimit
+= delta
;
// NOTE(review): presumably the "nothing changed" branch - limit is reset to
// the end of the input range for the next iteration; the surrounding else and
// the matching start update are not visible in this listing.
223 limit
= offsets
.limit
;
// Report the position reached back to the caller.
227 offsets
.start
= start
;
232 #endif /* #if !UCONFIG_NO_TRANSLITERATION */