]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
46f4442e | 3 | * Copyright (C) 2001-2007, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * Date Name Description | |
7 | * 07/03/01 aliu Creation. | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION | |
14 | ||
15 | #include "unicode/uniset.h" | |
16 | #include "unicode/uiter.h" | |
17 | #include "nortrans.h" | |
18 | #include "unormimp.h" | |
b75a7d8f A |
19 | #include "ucln_in.h" |
20 | ||
21 | U_NAMESPACE_BEGIN | |
22 | ||
374ca955 | 23 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) |
b75a7d8f A |
24 | |
25 | /** | |
26 | * System registration hook. | |
27 | */ | |
28 | void NormalizationTransliterator::registerIDs() { | |
29 | UErrorCode errorCode = U_ZERO_ERROR; | |
30 | if(!unorm_haveData(&errorCode)) { | |
31 | return; | |
32 | } | |
33 | ||
34 | Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), | |
35 | _create, integerToken(UNORM_NFC)); | |
36 | Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), | |
37 | _create, integerToken(UNORM_NFKC)); | |
38 | Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), | |
39 | _create, integerToken(UNORM_NFD)); | |
40 | Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), | |
41 | _create, integerToken(UNORM_NFKD)); | |
42 | Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), | |
43 | UNICODE_STRING_SIMPLE("NFD"), TRUE); | |
44 | Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), | |
45 | UNICODE_STRING_SIMPLE("NFKD"), TRUE); | |
46 | } | |
47 | ||
48 | /** | |
49 | * Factory methods | |
50 | */ | |
51 | Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, | |
52 | Token context) { | |
53 | return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0); | |
54 | } | |
55 | ||
56 | /** | |
57 | * Constructs a transliterator. | |
58 | */ | |
59 | NormalizationTransliterator::NormalizationTransliterator( | |
60 | const UnicodeString& id, | |
61 | UNormalizationMode mode, int32_t opt) : | |
62 | Transliterator(id, 0) { | |
63 | fMode = mode; | |
64 | options = opt; | |
65 | } | |
66 | ||
67 | /** | |
68 | * Destructor. | |
69 | */ | |
70 | NormalizationTransliterator::~NormalizationTransliterator() { | |
71 | } | |
72 | ||
73 | /** | |
74 | * Copy constructor. | |
75 | */ | |
76 | NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : | |
77 | Transliterator(o) { | |
78 | fMode = o.fMode; | |
79 | options = o.options; | |
80 | } | |
81 | ||
82 | /** | |
83 | * Assignment operator. | |
84 | */ | |
46f4442e | 85 | /*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) { |
b75a7d8f A |
86 | Transliterator::operator=(o); |
87 | fMode = o.fMode; | |
88 | options = o.options; | |
89 | return *this; | |
46f4442e | 90 | }*/ |
b75a7d8f A |
91 | |
92 | /** | |
93 | * Transliterator API. | |
94 | */ | |
95 | Transliterator* NormalizationTransliterator::clone(void) const { | |
96 | return new NormalizationTransliterator(*this); | |
97 | } | |
98 | ||
99 | /** | |
100 | * Implements {@link Transliterator#handleTransliterate}. | |
101 | */ | |
102 | void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, | |
103 | UBool isIncremental) const { | |
104 | // start and limit of the input range | |
105 | int32_t start = offsets.start; | |
106 | int32_t limit = offsets.limit; | |
107 | int32_t length, delta; | |
108 | ||
109 | if(start >= limit) { | |
110 | return; | |
111 | } | |
112 | ||
113 | // a C code unit iterator, implemented around the Replaceable | |
114 | UCharIterator iter; | |
115 | uiter_setReplaceable(&iter, &text); | |
116 | ||
117 | // the output string and buffer pointer | |
118 | UnicodeString output; | |
119 | UChar *buffer; | |
120 | UBool neededToNormalize; | |
121 | ||
122 | UErrorCode errorCode; | |
123 | ||
124 | /* | |
125 | * Normalize as short chunks at a time as possible even in | |
126 | * bulk mode, so that styled text is minimally disrupted. | |
127 | * In incremental mode, a chunk that ends with offsets.limit | |
128 | * must not be normalized. | |
129 | * | |
130 | * If it was known that the input text is not styled, then | |
131 | * a bulk mode normalization could look like this: | |
132 | * | |
133 | ||
134 | UChar staticChars[256]; | |
135 | UnicodeString input; | |
136 | ||
137 | length = limit - start; | |
138 | input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias | |
139 | ||
140 | _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); | |
141 | input.releaseBuffer(length); | |
142 | ||
143 | UErrorCode status = U_ZERO_ERROR; | |
144 | Normalizer::normalize(input, fMode, options, output, status); | |
145 | ||
146 | text.handleReplaceBetween(start, limit, output); | |
147 | ||
148 | int32_t delta = output.length() - length; | |
149 | offsets.contextLimit += delta; | |
150 | offsets.limit += delta; | |
151 | offsets.start = limit + delta; | |
152 | ||
153 | * | |
154 | */ | |
155 | while(start < limit) { | |
156 | // set the iterator limits for the remaining input range | |
157 | // this is a moving target because of the replacements in the text object | |
158 | iter.start = iter.index = start; | |
159 | iter.limit = limit; | |
160 | ||
161 | // incrementally normalize a small chunk of the input | |
162 | buffer = output.getBuffer(-1); | |
163 | errorCode = U_ZERO_ERROR; | |
164 | length = unorm_next(&iter, buffer, output.getCapacity(), | |
165 | fMode, 0, | |
166 | TRUE, &neededToNormalize, | |
167 | &errorCode); | |
73c04bcf | 168 | output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); |
b75a7d8f A |
169 | |
170 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { | |
171 | // use a larger output string buffer and do it again from the start | |
172 | iter.index = start; | |
173 | buffer = output.getBuffer(length); | |
174 | errorCode = U_ZERO_ERROR; | |
175 | length = unorm_next(&iter, buffer, output.getCapacity(), | |
176 | fMode, 0, | |
177 | TRUE, &neededToNormalize, | |
178 | &errorCode); | |
73c04bcf | 179 | output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); |
b75a7d8f A |
180 | } |
181 | ||
182 | if(U_FAILURE(errorCode)) { | |
183 | break; | |
184 | } | |
185 | ||
186 | limit = iter.index; | |
187 | if(isIncremental && limit == iter.limit) { | |
188 | // stop in incremental mode when we reach the input limit | |
189 | // in case there are additional characters that could change the | |
190 | // normalization result | |
191 | ||
192 | // UNLESS all characters in the result of the normalization of | |
193 | // the last run are in the skippable set | |
194 | const UChar *s=output.getBuffer(); | |
195 | int32_t i=0, outLength=output.length(); | |
196 | UChar32 c; | |
197 | ||
198 | while(i<outLength) { | |
199 | U16_NEXT(s, i, outLength, c); | |
200 | if(!unorm_isNFSkippable(c, fMode)) { | |
201 | outLength=-1; // I wish C++ had labeled loops and break outer; ... | |
202 | break; | |
203 | } | |
204 | } | |
205 | if (outLength<0) { | |
206 | break; | |
207 | } | |
208 | } | |
209 | ||
210 | if(neededToNormalize) { | |
211 | // replace the input chunk with its normalized form | |
212 | text.handleReplaceBetween(start, limit, output); | |
213 | ||
214 | // update all necessary indexes accordingly | |
215 | delta = length - (limit - start); // length change in the text object | |
216 | start = limit += delta; // the next chunk starts where this one ends, with adjustment | |
217 | limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range | |
218 | offsets.contextLimit += delta; | |
219 | } else { | |
220 | // delta == 0 | |
221 | start = limit; | |
222 | limit = offsets.limit; | |
223 | } | |
224 | } | |
225 | ||
226 | offsets.start = start; | |
227 | } | |
228 | ||
229 | U_NAMESPACE_END | |
230 | ||
231 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |