]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ************************************************************************* | |
3 | * COPYRIGHT: | |
729e4ab9 | 4 | * Copyright (c) 1996-2010, International Business Machines Corporation and |
b75a7d8f A |
5 | * others. All Rights Reserved. |
6 | ************************************************************************* | |
7 | */ | |
8 | ||
9 | #include "unicode/utypes.h" | |
10 | ||
11 | #if !UCONFIG_NO_NORMALIZATION | |
12 | ||
729e4ab9 | 13 | #include "unicode/uniset.h" |
b75a7d8f A |
14 | #include "unicode/unistr.h" |
15 | #include "unicode/chariter.h" | |
16 | #include "unicode/schriter.h" | |
17 | #include "unicode/uchriter.h" | |
b75a7d8f A |
18 | #include "unicode/normlzr.h" |
19 | #include "cmemory.h" | |
729e4ab9 A |
20 | #include "normalizer2impl.h" |
21 | #include "uprops.h" // for uniset_getUnicode32Instance() | |
b75a7d8f A |
22 | |
23 | U_NAMESPACE_BEGIN | |
24 | ||
374ca955 | 25 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) |
b75a7d8f A |
26 | |
27 | //------------------------------------------------------------------------- | |
28 | // Constructors and other boilerplate | |
29 | //------------------------------------------------------------------------- | |
30 | ||
31 | Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : | |
729e4ab9 A |
32 | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
33 | text(new StringCharacterIterator(str)), | |
b75a7d8f A |
34 | currentIndex(0), nextIndex(0), |
35 | buffer(), bufferPos(0) | |
36 | { | |
729e4ab9 | 37 | init(); |
b75a7d8f A |
38 | } |
39 | ||
40 | Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : | |
729e4ab9 A |
41 | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
42 | text(new UCharCharacterIterator(str, length)), | |
b75a7d8f A |
43 | currentIndex(0), nextIndex(0), |
44 | buffer(), bufferPos(0) | |
45 | { | |
729e4ab9 | 46 | init(); |
b75a7d8f A |
47 | } |
48 | ||
49 | Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : | |
729e4ab9 A |
50 | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
51 | text(iter.clone()), | |
b75a7d8f A |
52 | currentIndex(0), nextIndex(0), |
53 | buffer(), bufferPos(0) | |
54 | { | |
729e4ab9 | 55 | init(); |
b75a7d8f A |
56 | } |
57 | ||
58 | Normalizer::Normalizer(const Normalizer ©) : | |
729e4ab9 A |
59 | UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), |
60 | text(copy.text->clone()), | |
b75a7d8f A |
61 | currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), |
62 | buffer(copy.buffer), bufferPos(copy.bufferPos) | |
63 | { | |
729e4ab9 | 64 | init(); |
b75a7d8f A |
65 | } |
66 | ||
67 | static const UChar _NUL=0; | |
68 | ||
69 | void | |
729e4ab9 | 70 | Normalizer::init() { |
b75a7d8f | 71 | UErrorCode errorCode=U_ZERO_ERROR; |
729e4ab9 A |
72 | fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); |
73 | if(fOptions&UNORM_UNICODE_3_2) { | |
74 | delete fFilteredNorm2; | |
75 | fNorm2=fFilteredNorm2= | |
76 | new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); | |
77 | } | |
78 | if(U_FAILURE(errorCode)) { | |
79 | errorCode=U_ZERO_ERROR; | |
80 | fNorm2=Normalizer2Factory::getNoopInstance(errorCode); | |
b75a7d8f A |
81 | } |
82 | } | |
83 | ||
84 | Normalizer::~Normalizer() | |
85 | { | |
729e4ab9 A |
86 | delete fFilteredNorm2; |
87 | delete text; | |
b75a7d8f A |
88 | } |
89 | ||
90 | Normalizer* | |
91 | Normalizer::clone() const | |
92 | { | |
729e4ab9 | 93 | return new Normalizer(*this); |
b75a7d8f A |
94 | } |
95 | ||
96 | /** | |
97 | * Generates a hash code for this iterator. | |
98 | */ | |
99 | int32_t Normalizer::hashCode() const | |
100 | { | |
729e4ab9 | 101 | return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; |
b75a7d8f A |
102 | } |
103 | ||
104 | UBool Normalizer::operator==(const Normalizer& that) const | |
105 | { | |
106 | return | |
107 | this==&that || | |
729e4ab9 | 108 | (fUMode==that.fUMode && |
b75a7d8f | 109 | fOptions==that.fOptions && |
729e4ab9 | 110 | *text==*that.text && |
b75a7d8f A |
111 | buffer==that.buffer && |
112 | bufferPos==that.bufferPos && | |
729e4ab9 | 113 | nextIndex==that.nextIndex); |
b75a7d8f A |
114 | } |
115 | ||
116 | //------------------------------------------------------------------------- | |
117 | // Static utility methods | |
118 | //------------------------------------------------------------------------- | |
119 | ||
374ca955 | 120 | void U_EXPORT2 |
b75a7d8f A |
121 | Normalizer::normalize(const UnicodeString& source, |
122 | UNormalizationMode mode, int32_t options, | |
123 | UnicodeString& result, | |
124 | UErrorCode &status) { | |
125 | if(source.isBogus() || U_FAILURE(status)) { | |
126 | result.setToBogus(); | |
127 | if(U_SUCCESS(status)) { | |
128 | status=U_ILLEGAL_ARGUMENT_ERROR; | |
129 | } | |
130 | } else { | |
131 | UnicodeString localDest; | |
132 | UnicodeString *dest; | |
133 | ||
134 | if(&source!=&result) { | |
135 | dest=&result; | |
136 | } else { | |
137 | // the source and result strings are the same object, use a temporary one | |
138 | dest=&localDest; | |
139 | } | |
729e4ab9 A |
140 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
141 | if(U_SUCCESS(status)) { | |
142 | if(options&UNORM_UNICODE_3_2) { | |
143 | FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). | |
144 | normalize(source, *dest, status); | |
145 | } else { | |
146 | n2->normalize(source, *dest, status); | |
147 | } | |
b75a7d8f | 148 | } |
729e4ab9 | 149 | if(dest==&localDest && U_SUCCESS(status)) { |
b75a7d8f A |
150 | result=*dest; |
151 | } | |
b75a7d8f A |
152 | } |
153 | } | |
154 | ||
374ca955 | 155 | void U_EXPORT2 |
b75a7d8f A |
156 | Normalizer::compose(const UnicodeString& source, |
157 | UBool compat, int32_t options, | |
158 | UnicodeString& result, | |
159 | UErrorCode &status) { | |
729e4ab9 | 160 | normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); |
b75a7d8f A |
161 | } |
162 | ||
374ca955 | 163 | void U_EXPORT2 |
b75a7d8f A |
164 | Normalizer::decompose(const UnicodeString& source, |
165 | UBool compat, int32_t options, | |
166 | UnicodeString& result, | |
167 | UErrorCode &status) { | |
729e4ab9 A |
168 | normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); |
169 | } | |
170 | ||
171 | UNormalizationCheckResult | |
172 | Normalizer::quickCheck(const UnicodeString& source, | |
173 | UNormalizationMode mode, int32_t options, | |
174 | UErrorCode &status) { | |
175 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); | |
176 | if(U_SUCCESS(status)) { | |
177 | if(options&UNORM_UNICODE_3_2) { | |
178 | return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). | |
179 | quickCheck(source, status); | |
180 | } else { | |
181 | return n2->quickCheck(source, status); | |
b75a7d8f A |
182 | } |
183 | } else { | |
729e4ab9 A |
184 | return UNORM_MAYBE; |
185 | } | |
186 | } | |
b75a7d8f | 187 | |
729e4ab9 A |
188 | UBool |
189 | Normalizer::isNormalized(const UnicodeString& source, | |
190 | UNormalizationMode mode, int32_t options, | |
191 | UErrorCode &status) { | |
192 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); | |
193 | if(U_SUCCESS(status)) { | |
194 | if(options&UNORM_UNICODE_3_2) { | |
195 | return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). | |
196 | isNormalized(source, status); | |
b75a7d8f | 197 | } else { |
729e4ab9 | 198 | return n2->isNormalized(source, status); |
b75a7d8f | 199 | } |
729e4ab9 A |
200 | } else { |
201 | return FALSE; | |
b75a7d8f A |
202 | } |
203 | } | |
204 | ||
374ca955 | 205 | UnicodeString & U_EXPORT2 |
b75a7d8f A |
206 | Normalizer::concatenate(UnicodeString &left, UnicodeString &right, |
207 | UnicodeString &result, | |
208 | UNormalizationMode mode, int32_t options, | |
209 | UErrorCode &errorCode) { | |
210 | if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { | |
211 | result.setToBogus(); | |
212 | if(U_SUCCESS(errorCode)) { | |
213 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
214 | } | |
215 | } else { | |
216 | UnicodeString localDest; | |
217 | UnicodeString *dest; | |
218 | ||
729e4ab9 | 219 | if(&right!=&result) { |
b75a7d8f A |
220 | dest=&result; |
221 | } else { | |
729e4ab9 | 222 | // the right and result strings are the same object, use a temporary one |
b75a7d8f A |
223 | dest=&localDest; |
224 | } | |
729e4ab9 A |
225 | *dest=left; |
226 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); | |
227 | if(U_SUCCESS(errorCode)) { | |
228 | if(options&UNORM_UNICODE_3_2) { | |
229 | FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). | |
230 | append(*dest, right, errorCode); | |
231 | } else { | |
232 | n2->append(*dest, right, errorCode); | |
233 | } | |
b75a7d8f | 234 | } |
729e4ab9 | 235 | if(dest==&localDest && U_SUCCESS(errorCode)) { |
b75a7d8f A |
236 | result=*dest; |
237 | } | |
b75a7d8f A |
238 | } |
239 | return result; | |
240 | } | |
241 | ||
242 | //------------------------------------------------------------------------- | |
243 | // Iteration API | |
244 | //------------------------------------------------------------------------- | |
245 | ||
246 | /** | |
247 | * Return the current character in the normalized text. | |
248 | */ | |
249 | UChar32 Normalizer::current() { | |
250 | if(bufferPos<buffer.length() || nextNormalize()) { | |
251 | return buffer.char32At(bufferPos); | |
252 | } else { | |
253 | return DONE; | |
254 | } | |
255 | } | |
256 | ||
257 | /** | |
258 | * Return the next character in the normalized text and advance | |
259 | * the iteration position by one. If the end | |
260 | * of the text has already been reached, {@link #DONE} is returned. | |
261 | */ | |
262 | UChar32 Normalizer::next() { | |
263 | if(bufferPos<buffer.length() || nextNormalize()) { | |
264 | UChar32 c=buffer.char32At(bufferPos); | |
265 | bufferPos+=UTF_CHAR_LENGTH(c); | |
266 | return c; | |
267 | } else { | |
268 | return DONE; | |
269 | } | |
270 | } | |
271 | ||
272 | /** | |
273 | * Return the previous character in the normalized text and decrement | |
274 | * the iteration position by one. If the beginning | |
275 | * of the text has already been reached, {@link #DONE} is returned. | |
276 | */ | |
277 | UChar32 Normalizer::previous() { | |
278 | if(bufferPos>0 || previousNormalize()) { | |
279 | UChar32 c=buffer.char32At(bufferPos-1); | |
280 | bufferPos-=UTF_CHAR_LENGTH(c); | |
281 | return c; | |
282 | } else { | |
283 | return DONE; | |
284 | } | |
285 | } | |
286 | ||
287 | void Normalizer::reset() { | |
729e4ab9 | 288 | currentIndex=nextIndex=text->setToStart(); |
b75a7d8f A |
289 | clearBuffer(); |
290 | } | |
291 | ||
292 | void | |
293 | Normalizer::setIndexOnly(int32_t index) { | |
729e4ab9 A |
294 | text->setIndex(index); // pins index |
295 | currentIndex=nextIndex=text->getIndex(); | |
b75a7d8f A |
296 | clearBuffer(); |
297 | } | |
298 | ||
299 | /** | |
729e4ab9 A |
300 | * Return the first character in the normalized text. This resets |
301 | * the <tt>Normalizer's</tt> position to the beginning of the text. | |
b75a7d8f A |
302 | */ |
303 | UChar32 Normalizer::first() { | |
304 | reset(); | |
305 | return next(); | |
306 | } | |
307 | ||
308 | /** | |
729e4ab9 | 309 | * Return the last character in the normalized text. This resets |
b75a7d8f A |
310 | * the <tt>Normalizer's</tt> position to be just before the |
311 | * the input text corresponding to that normalized character. | |
312 | */ | |
313 | UChar32 Normalizer::last() { | |
729e4ab9 | 314 | currentIndex=nextIndex=text->setToEnd(); |
b75a7d8f A |
315 | clearBuffer(); |
316 | return previous(); | |
317 | } | |
318 | ||
319 | /** | |
320 | * Retrieve the current iteration position in the input text that is | |
321 | * being normalized. This method is useful in applications such as | |
322 | * searching, where you need to be able to determine the position in | |
323 | * the input text that corresponds to a given normalized output character. | |
324 | * <p> | |
325 | * <b>Note:</b> This method sets the position in the <em>input</em>, while | |
326 | * {@link #next} and {@link #previous} iterate through characters in the | |
327 | * <em>output</em>. This means that there is not necessarily a one-to-one | |
328 | * correspondence between characters returned by <tt>next</tt> and | |
329 | * <tt>previous</tt> and the indices passed to and returned from | |
330 | * <tt>setIndex</tt> and {@link #getIndex}. | |
331 | * | |
332 | */ | |
333 | int32_t Normalizer::getIndex() const { | |
334 | if(bufferPos<buffer.length()) { | |
335 | return currentIndex; | |
336 | } else { | |
337 | return nextIndex; | |
338 | } | |
339 | } | |
340 | ||
341 | /** | |
729e4ab9 | 342 | * Retrieve the index of the start of the input text. This is the begin index |
b75a7d8f A |
343 | * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> |
344 | * over which this <tt>Normalizer</tt> is iterating | |
345 | */ | |
346 | int32_t Normalizer::startIndex() const { | |
729e4ab9 | 347 | return text->startIndex(); |
b75a7d8f A |
348 | } |
349 | ||
350 | /** | |
729e4ab9 | 351 | * Retrieve the index of the end of the input text. This is the end index |
b75a7d8f A |
352 | * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> |
353 | * over which this <tt>Normalizer</tt> is iterating | |
354 | */ | |
355 | int32_t Normalizer::endIndex() const { | |
729e4ab9 | 356 | return text->endIndex(); |
b75a7d8f A |
357 | } |
358 | ||
359 | //------------------------------------------------------------------------- | |
360 | // Property access methods | |
361 | //------------------------------------------------------------------------- | |
362 | ||
363 | void | |
364 | Normalizer::setMode(UNormalizationMode newMode) | |
365 | { | |
366 | fUMode = newMode; | |
729e4ab9 | 367 | init(); |
b75a7d8f A |
368 | } |
369 | ||
370 | UNormalizationMode | |
371 | Normalizer::getUMode() const | |
372 | { | |
373 | return fUMode; | |
374 | } | |
375 | ||
376 | void | |
377 | Normalizer::setOption(int32_t option, | |
378 | UBool value) | |
379 | { | |
380 | if (value) { | |
381 | fOptions |= option; | |
382 | } else { | |
383 | fOptions &= (~option); | |
384 | } | |
729e4ab9 | 385 | init(); |
b75a7d8f A |
386 | } |
387 | ||
388 | UBool | |
389 | Normalizer::getOption(int32_t option) const | |
390 | { | |
391 | return (fOptions & option) != 0; | |
392 | } | |
393 | ||
394 | /** | |
395 | * Set the input text over which this <tt>Normalizer</tt> will iterate. | |
729e4ab9 | 396 | * The iteration position is set to the beginning of the input text. |
b75a7d8f A |
397 | */ |
398 | void | |
399 | Normalizer::setText(const UnicodeString& newText, | |
400 | UErrorCode &status) | |
401 | { | |
402 | if (U_FAILURE(status)) { | |
403 | return; | |
404 | } | |
405 | CharacterIterator *newIter = new StringCharacterIterator(newText); | |
406 | if (newIter == NULL) { | |
407 | status = U_MEMORY_ALLOCATION_ERROR; | |
408 | return; | |
409 | } | |
729e4ab9 A |
410 | delete text; |
411 | text = newIter; | |
b75a7d8f A |
412 | reset(); |
413 | } | |
414 | ||
415 | /** | |
416 | * Set the input text over which this <tt>Normalizer</tt> will iterate. | |
417 | * The iteration position is set to the beginning of the string. | |
418 | */ | |
419 | void | |
420 | Normalizer::setText(const CharacterIterator& newText, | |
421 | UErrorCode &status) | |
422 | { | |
423 | if (U_FAILURE(status)) { | |
424 | return; | |
425 | } | |
426 | CharacterIterator *newIter = newText.clone(); | |
427 | if (newIter == NULL) { | |
428 | status = U_MEMORY_ALLOCATION_ERROR; | |
429 | return; | |
430 | } | |
729e4ab9 A |
431 | delete text; |
432 | text = newIter; | |
b75a7d8f A |
433 | reset(); |
434 | } | |
435 | ||
436 | void | |
437 | Normalizer::setText(const UChar* newText, | |
438 | int32_t length, | |
439 | UErrorCode &status) | |
440 | { | |
441 | if (U_FAILURE(status)) { | |
442 | return; | |
443 | } | |
444 | CharacterIterator *newIter = new UCharCharacterIterator(newText, length); | |
445 | if (newIter == NULL) { | |
446 | status = U_MEMORY_ALLOCATION_ERROR; | |
447 | return; | |
448 | } | |
729e4ab9 A |
449 | delete text; |
450 | text = newIter; | |
b75a7d8f A |
451 | reset(); |
452 | } | |
453 | ||
454 | /** | |
455 | * Copies the text under iteration into the UnicodeString referred to by "result". | |
456 | * @param result Receives a copy of the text under iteration. | |
457 | */ | |
458 | void | |
459 | Normalizer::getText(UnicodeString& result) | |
460 | { | |
729e4ab9 | 461 | text->getText(result); |
b75a7d8f A |
462 | } |
463 | ||
464 | //------------------------------------------------------------------------- | |
465 | // Private utility methods | |
466 | //------------------------------------------------------------------------- | |
467 | ||
468 | void Normalizer::clearBuffer() { | |
469 | buffer.remove(); | |
470 | bufferPos=0; | |
471 | } | |
472 | ||
473 | UBool | |
474 | Normalizer::nextNormalize() { | |
b75a7d8f A |
475 | clearBuffer(); |
476 | currentIndex=nextIndex; | |
729e4ab9 A |
477 | text->setIndex(nextIndex); |
478 | if(!text->hasNext()) { | |
b75a7d8f A |
479 | return FALSE; |
480 | } | |
729e4ab9 A |
481 | // Skip at least one character so we make progress. |
482 | UnicodeString segment(text->next32PostInc()); | |
483 | while(text->hasNext()) { | |
484 | UChar32 c; | |
485 | if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { | |
486 | text->move32(-1, CharacterIterator::kCurrent); | |
487 | break; | |
488 | } | |
489 | segment.append(c); | |
b75a7d8f | 490 | } |
729e4ab9 A |
491 | nextIndex=text->getIndex(); |
492 | UErrorCode errorCode=U_ZERO_ERROR; | |
493 | fNorm2->normalize(segment, buffer, errorCode); | |
b75a7d8f A |
494 | return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
495 | } | |
496 | ||
497 | UBool | |
498 | Normalizer::previousNormalize() { | |
b75a7d8f A |
499 | clearBuffer(); |
500 | nextIndex=currentIndex; | |
729e4ab9 A |
501 | text->setIndex(currentIndex); |
502 | if(!text->hasPrevious()) { | |
b75a7d8f A |
503 | return FALSE; |
504 | } | |
729e4ab9 A |
505 | UnicodeString segment; |
506 | while(text->hasPrevious()) { | |
507 | UChar32 c=text->previous32(); | |
508 | segment.insert(0, c); | |
509 | if(fNorm2->hasBoundaryBefore(c)) { | |
510 | break; | |
511 | } | |
b75a7d8f | 512 | } |
729e4ab9 A |
513 | currentIndex=text->getIndex(); |
514 | UErrorCode errorCode=U_ZERO_ERROR; | |
515 | fNorm2->normalize(segment, buffer, errorCode); | |
b75a7d8f | 516 | bufferPos=buffer.length(); |
b75a7d8f A |
517 | return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
518 | } | |
519 | ||
520 | U_NAMESPACE_END | |
521 | ||
522 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |