]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
73c04bcf | 3 | * Copyright (C) 2001-2005, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * Date Name Description | |
7 | * 07/03/01 aliu Creation. | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION | |
14 | ||
15 | #include "unicode/uniset.h" | |
16 | #include "unicode/uiter.h" | |
17 | #include "nortrans.h" | |
18 | #include "unormimp.h" | |
19 | #include "mutex.h" | |
20 | #include "ucln_in.h" | |
21 | ||
22 | U_NAMESPACE_BEGIN | |
23 | ||
374ca955 | 24 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) |
b75a7d8f A |
25 | |
26 | /** | |
27 | * System registration hook. | |
28 | */ | |
29 | void NormalizationTransliterator::registerIDs() { | |
30 | UErrorCode errorCode = U_ZERO_ERROR; | |
31 | if(!unorm_haveData(&errorCode)) { | |
32 | return; | |
33 | } | |
34 | ||
35 | Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), | |
36 | _create, integerToken(UNORM_NFC)); | |
37 | Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), | |
38 | _create, integerToken(UNORM_NFKC)); | |
39 | Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), | |
40 | _create, integerToken(UNORM_NFD)); | |
41 | Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), | |
42 | _create, integerToken(UNORM_NFKD)); | |
43 | Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), | |
44 | UNICODE_STRING_SIMPLE("NFD"), TRUE); | |
45 | Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), | |
46 | UNICODE_STRING_SIMPLE("NFKD"), TRUE); | |
47 | } | |
48 | ||
49 | /** | |
50 | * Factory methods | |
51 | */ | |
52 | Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, | |
53 | Token context) { | |
54 | return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0); | |
55 | } | |
56 | ||
57 | /** | |
58 | * Constructs a transliterator. | |
59 | */ | |
60 | NormalizationTransliterator::NormalizationTransliterator( | |
61 | const UnicodeString& id, | |
62 | UNormalizationMode mode, int32_t opt) : | |
63 | Transliterator(id, 0) { | |
64 | fMode = mode; | |
65 | options = opt; | |
66 | } | |
67 | ||
68 | /** | |
69 | * Destructor. | |
70 | */ | |
71 | NormalizationTransliterator::~NormalizationTransliterator() { | |
72 | } | |
73 | ||
74 | /** | |
75 | * Copy constructor. | |
76 | */ | |
77 | NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : | |
78 | Transliterator(o) { | |
79 | fMode = o.fMode; | |
80 | options = o.options; | |
81 | } | |
82 | ||
83 | /** | |
84 | * Assignment operator. | |
85 | */ | |
86 | NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) { | |
87 | Transliterator::operator=(o); | |
88 | fMode = o.fMode; | |
89 | options = o.options; | |
90 | return *this; | |
91 | } | |
92 | ||
93 | /** | |
94 | * Transliterator API. | |
95 | */ | |
96 | Transliterator* NormalizationTransliterator::clone(void) const { | |
97 | return new NormalizationTransliterator(*this); | |
98 | } | |
99 | ||
100 | /** | |
101 | * Implements {@link Transliterator#handleTransliterate}. | |
102 | */ | |
103 | void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, | |
104 | UBool isIncremental) const { | |
105 | // start and limit of the input range | |
106 | int32_t start = offsets.start; | |
107 | int32_t limit = offsets.limit; | |
108 | int32_t length, delta; | |
109 | ||
110 | if(start >= limit) { | |
111 | return; | |
112 | } | |
113 | ||
114 | // a C code unit iterator, implemented around the Replaceable | |
115 | UCharIterator iter; | |
116 | uiter_setReplaceable(&iter, &text); | |
117 | ||
118 | // the output string and buffer pointer | |
119 | UnicodeString output; | |
120 | UChar *buffer; | |
121 | UBool neededToNormalize; | |
122 | ||
123 | UErrorCode errorCode; | |
124 | ||
125 | /* | |
126 | * Normalize as short chunks at a time as possible even in | |
127 | * bulk mode, so that styled text is minimally disrupted. | |
128 | * In incremental mode, a chunk that ends with offsets.limit | |
129 | * must not be normalized. | |
130 | * | |
131 | * If it was known that the input text is not styled, then | |
132 | * a bulk mode normalization could look like this: | |
133 | * | |
134 | ||
135 | UChar staticChars[256]; | |
136 | UnicodeString input; | |
137 | ||
138 | length = limit - start; | |
139 | input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias | |
140 | ||
141 | _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); | |
142 | input.releaseBuffer(length); | |
143 | ||
144 | UErrorCode status = U_ZERO_ERROR; | |
145 | Normalizer::normalize(input, fMode, options, output, status); | |
146 | ||
147 | text.handleReplaceBetween(start, limit, output); | |
148 | ||
149 | int32_t delta = output.length() - length; | |
150 | offsets.contextLimit += delta; | |
151 | offsets.limit += delta; | |
152 | offsets.start = limit + delta; | |
153 | ||
154 | * | |
155 | */ | |
156 | while(start < limit) { | |
157 | // set the iterator limits for the remaining input range | |
158 | // this is a moving target because of the replacements in the text object | |
159 | iter.start = iter.index = start; | |
160 | iter.limit = limit; | |
161 | ||
162 | // incrementally normalize a small chunk of the input | |
163 | buffer = output.getBuffer(-1); | |
164 | errorCode = U_ZERO_ERROR; | |
165 | length = unorm_next(&iter, buffer, output.getCapacity(), | |
166 | fMode, 0, | |
167 | TRUE, &neededToNormalize, | |
168 | &errorCode); | |
73c04bcf | 169 | output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); |
b75a7d8f A |
170 | |
171 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { | |
172 | // use a larger output string buffer and do it again from the start | |
173 | iter.index = start; | |
174 | buffer = output.getBuffer(length); | |
175 | errorCode = U_ZERO_ERROR; | |
176 | length = unorm_next(&iter, buffer, output.getCapacity(), | |
177 | fMode, 0, | |
178 | TRUE, &neededToNormalize, | |
179 | &errorCode); | |
73c04bcf | 180 | output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); |
b75a7d8f A |
181 | } |
182 | ||
183 | if(U_FAILURE(errorCode)) { | |
184 | break; | |
185 | } | |
186 | ||
187 | limit = iter.index; | |
188 | if(isIncremental && limit == iter.limit) { | |
189 | // stop in incremental mode when we reach the input limit | |
190 | // in case there are additional characters that could change the | |
191 | // normalization result | |
192 | ||
193 | // UNLESS all characters in the result of the normalization of | |
194 | // the last run are in the skippable set | |
195 | const UChar *s=output.getBuffer(); | |
196 | int32_t i=0, outLength=output.length(); | |
197 | UChar32 c; | |
198 | ||
199 | while(i<outLength) { | |
200 | U16_NEXT(s, i, outLength, c); | |
201 | if(!unorm_isNFSkippable(c, fMode)) { | |
202 | outLength=-1; // I wish C++ had labeled loops and break outer; ... | |
203 | break; | |
204 | } | |
205 | } | |
206 | if (outLength<0) { | |
207 | break; | |
208 | } | |
209 | } | |
210 | ||
211 | if(neededToNormalize) { | |
212 | // replace the input chunk with its normalized form | |
213 | text.handleReplaceBetween(start, limit, output); | |
214 | ||
215 | // update all necessary indexes accordingly | |
216 | delta = length - (limit - start); // length change in the text object | |
217 | start = limit += delta; // the next chunk starts where this one ends, with adjustment | |
218 | limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range | |
219 | offsets.contextLimit += delta; | |
220 | } else { | |
221 | // delta == 0 | |
222 | start = limit; | |
223 | limit = offsets.limit; | |
224 | } | |
225 | } | |
226 | ||
227 | offsets.start = start; | |
228 | } | |
229 | ||
230 | U_NAMESPACE_END | |
231 | ||
232 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |