]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/nortrans.cpp
ICU-6.2.14.tar.gz
[apple/icu.git] / icuSources / i18n / nortrans.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2001-2003, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 07/03/01 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uniset.h"
16 #include "unicode/uiter.h"
17 #include "nortrans.h"
18 #include "unormimp.h"
19 #include "mutex.h"
20 #include "ucln_in.h"
21
22 U_NAMESPACE_BEGIN
23
24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
25
26 /**
27 * System registration hook.
28 */
29 void NormalizationTransliterator::registerIDs() {
30 UErrorCode errorCode = U_ZERO_ERROR;
31 if(!unorm_haveData(&errorCode)) {
32 return;
33 }
34
35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
36 _create, integerToken(UNORM_NFC));
37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
38 _create, integerToken(UNORM_NFKC));
39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
40 _create, integerToken(UNORM_NFD));
41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
42 _create, integerToken(UNORM_NFKD));
43 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
44 UNICODE_STRING_SIMPLE("NFD"), TRUE);
45 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
46 UNICODE_STRING_SIMPLE("NFKD"), TRUE);
47 }
48
49 /**
50 * Factory methods
51 */
52 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
53 Token context) {
54 return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
55 }
56
57 /**
58 * Constructs a transliterator.
59 */
60 NormalizationTransliterator::NormalizationTransliterator(
61 const UnicodeString& id,
62 UNormalizationMode mode, int32_t opt) :
63 Transliterator(id, 0) {
64 fMode = mode;
65 options = opt;
66 }
67
68 /**
69 * Destructor.
70 */
71 NormalizationTransliterator::~NormalizationTransliterator() {
72 }
73
74 /**
75 * Copy constructor.
76 */
77 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
78 Transliterator(o) {
79 fMode = o.fMode;
80 options = o.options;
81 }
82
83 /**
84 * Assignment operator.
85 */
86 NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
87 Transliterator::operator=(o);
88 fMode = o.fMode;
89 options = o.options;
90 return *this;
91 }
92
93 /**
94 * Transliterator API.
95 */
96 Transliterator* NormalizationTransliterator::clone(void) const {
97 return new NormalizationTransliterator(*this);
98 }
99
100 /**
101 * Implements {@link Transliterator#handleTransliterate}.
102 */
103 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
104 UBool isIncremental) const {
105 // start and limit of the input range
106 int32_t start = offsets.start;
107 int32_t limit = offsets.limit;
108 int32_t length, delta;
109
110 if(start >= limit) {
111 return;
112 }
113
114 // a C code unit iterator, implemented around the Replaceable
115 UCharIterator iter;
116 uiter_setReplaceable(&iter, &text);
117
118 // the output string and buffer pointer
119 UnicodeString output;
120 UChar *buffer;
121 UBool neededToNormalize;
122
123 UErrorCode errorCode;
124
125 /*
126 * Normalize as short chunks at a time as possible even in
127 * bulk mode, so that styled text is minimally disrupted.
128 * In incremental mode, a chunk that ends with offsets.limit
129 * must not be normalized.
130 *
131 * If it was known that the input text is not styled, then
132 * a bulk mode normalization could look like this:
133 *
134
135 UChar staticChars[256];
136 UnicodeString input;
137
138 length = limit - start;
139 input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
140
141 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
142 input.releaseBuffer(length);
143
144 UErrorCode status = U_ZERO_ERROR;
145 Normalizer::normalize(input, fMode, options, output, status);
146
147 text.handleReplaceBetween(start, limit, output);
148
149 int32_t delta = output.length() - length;
150 offsets.contextLimit += delta;
151 offsets.limit += delta;
152 offsets.start = limit + delta;
153
154 *
155 */
156 while(start < limit) {
157 // set the iterator limits for the remaining input range
158 // this is a moving target because of the replacements in the text object
159 iter.start = iter.index = start;
160 iter.limit = limit;
161
162 // incrementally normalize a small chunk of the input
163 buffer = output.getBuffer(-1);
164 errorCode = U_ZERO_ERROR;
165 length = unorm_next(&iter, buffer, output.getCapacity(),
166 fMode, 0,
167 TRUE, &neededToNormalize,
168 &errorCode);
169 output.releaseBuffer(length);
170
171 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
172 // use a larger output string buffer and do it again from the start
173 iter.index = start;
174 buffer = output.getBuffer(length);
175 errorCode = U_ZERO_ERROR;
176 length = unorm_next(&iter, buffer, output.getCapacity(),
177 fMode, 0,
178 TRUE, &neededToNormalize,
179 &errorCode);
180 output.releaseBuffer(length);
181 }
182
183 if(U_FAILURE(errorCode)) {
184 break;
185 }
186
187 limit = iter.index;
188 if(isIncremental && limit == iter.limit) {
189 // stop in incremental mode when we reach the input limit
190 // in case there are additional characters that could change the
191 // normalization result
192
193 // UNLESS all characters in the result of the normalization of
194 // the last run are in the skippable set
195 const UChar *s=output.getBuffer();
196 int32_t i=0, outLength=output.length();
197 UChar32 c;
198
199 while(i<outLength) {
200 U16_NEXT(s, i, outLength, c);
201 if(!unorm_isNFSkippable(c, fMode)) {
202 outLength=-1; // I wish C++ had labeled loops and break outer; ...
203 break;
204 }
205 }
206 if (outLength<0) {
207 break;
208 }
209 }
210
211 if(neededToNormalize) {
212 // replace the input chunk with its normalized form
213 text.handleReplaceBetween(start, limit, output);
214
215 // update all necessary indexes accordingly
216 delta = length - (limit - start); // length change in the text object
217 start = limit += delta; // the next chunk starts where this one ends, with adjustment
218 limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
219 offsets.contextLimit += delta;
220 } else {
221 // delta == 0
222 start = limit;
223 limit = offsets.limit;
224 }
225 }
226
227 offsets.start = start;
228 }
229
230 U_NAMESPACE_END
231
232 #endif /* #if !UCONFIG_NO_TRANSLITERATION */