]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ************************************************************************* | |
3 | * COPYRIGHT: | |
4 | * Copyright (c) 1996-2005, International Business Machines Corporation and | |
5 | * others. All Rights Reserved. | |
6 | ************************************************************************* | |
7 | */ | |
8 | ||
9 | #include "unicode/utypes.h" | |
10 | ||
11 | #if !UCONFIG_NO_NORMALIZATION | |
12 | ||
13 | #include "unicode/unistr.h" | |
14 | #include "unicode/chariter.h" | |
15 | #include "unicode/schriter.h" | |
16 | #include "unicode/uchriter.h" | |
17 | #include "unicode/uiter.h" | |
18 | #include "unicode/normlzr.h" | |
19 | #include "cmemory.h" | |
20 | #include "unormimp.h" | |
21 | ||
22 | U_NAMESPACE_BEGIN | |
23 | ||
24 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) | |
25 | ||
26 | //------------------------------------------------------------------------- | |
27 | // Constructors and other boilerplate | |
28 | //------------------------------------------------------------------------- | |
29 | ||
30 | Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : | |
31 | UObject(), fUMode(mode), fOptions(0), | |
32 | currentIndex(0), nextIndex(0), | |
33 | buffer(), bufferPos(0) | |
34 | { | |
35 | init(new StringCharacterIterator(str)); | |
36 | } | |
37 | ||
38 | Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : | |
39 | UObject(), fUMode(mode), fOptions(0), | |
40 | currentIndex(0), nextIndex(0), | |
41 | buffer(), bufferPos(0) | |
42 | { | |
43 | init(new UCharCharacterIterator(str, length)); | |
44 | } | |
45 | ||
46 | Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : | |
47 | UObject(), fUMode(mode), fOptions(0), | |
48 | currentIndex(0), nextIndex(0), | |
49 | buffer(), bufferPos(0) | |
50 | { | |
51 | init(iter.clone()); | |
52 | } | |
53 | ||
54 | Normalizer::Normalizer(const Normalizer ©) : | |
55 | UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions), | |
56 | currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), | |
57 | buffer(copy.buffer), bufferPos(copy.bufferPos) | |
58 | { | |
59 | init(((CharacterIterator *)(copy.text->context))->clone()); | |
60 | } | |
61 | ||
62 | static const UChar _NUL=0; | |
63 | ||
64 | void | |
65 | Normalizer::init(CharacterIterator *iter) { | |
66 | UErrorCode errorCode=U_ZERO_ERROR; | |
67 | ||
68 | text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator)); | |
69 | if(text!=NULL) { | |
70 | if(unorm_haveData(&errorCode)) { | |
71 | uiter_setCharacterIterator(text, iter); | |
72 | } else { | |
73 | delete iter; | |
74 | uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0)); | |
75 | } | |
76 | } else { | |
77 | delete iter; | |
78 | } | |
79 | } | |
80 | ||
81 | Normalizer::~Normalizer() | |
82 | { | |
83 | if(text!=NULL) { | |
84 | delete (CharacterIterator *)text->context; | |
85 | uprv_free(text); | |
86 | } | |
87 | } | |
88 | ||
89 | Normalizer* | |
90 | Normalizer::clone() const | |
91 | { | |
92 | if(this!=0) { | |
93 | return new Normalizer(*this); | |
94 | } else { | |
95 | return 0; | |
96 | } | |
97 | } | |
98 | ||
99 | /** | |
100 | * Generates a hash code for this iterator. | |
101 | */ | |
102 | int32_t Normalizer::hashCode() const | |
103 | { | |
104 | return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; | |
105 | } | |
106 | ||
107 | UBool Normalizer::operator==(const Normalizer& that) const | |
108 | { | |
109 | return | |
110 | this==&that || | |
111 | fUMode==that.fUMode && | |
112 | fOptions==that.fOptions && | |
113 | *((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) && | |
114 | buffer==that.buffer && | |
115 | bufferPos==that.bufferPos && | |
116 | nextIndex==that.nextIndex; | |
117 | } | |
118 | ||
119 | //------------------------------------------------------------------------- | |
120 | // Static utility methods | |
121 | //------------------------------------------------------------------------- | |
122 | ||
123 | void U_EXPORT2 | |
124 | Normalizer::normalize(const UnicodeString& source, | |
125 | UNormalizationMode mode, int32_t options, | |
126 | UnicodeString& result, | |
127 | UErrorCode &status) { | |
128 | if(source.isBogus() || U_FAILURE(status)) { | |
129 | result.setToBogus(); | |
130 | if(U_SUCCESS(status)) { | |
131 | status=U_ILLEGAL_ARGUMENT_ERROR; | |
132 | } | |
133 | } else { | |
134 | UnicodeString localDest; | |
135 | UnicodeString *dest; | |
136 | ||
137 | if(&source!=&result) { | |
138 | dest=&result; | |
139 | } else { | |
140 | // the source and result strings are the same object, use a temporary one | |
141 | dest=&localDest; | |
142 | } | |
143 | ||
144 | UChar *buffer=dest->getBuffer(source.length()); | |
145 | int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(), | |
146 | source.getBuffer(), source.length(), | |
147 | mode, options, | |
148 | &status); | |
149 | dest->releaseBuffer(U_SUCCESS(status) ? length : 0); | |
150 | if(status==U_BUFFER_OVERFLOW_ERROR) { | |
151 | status=U_ZERO_ERROR; | |
152 | buffer=dest->getBuffer(length); | |
153 | length=unorm_internalNormalize(buffer, dest->getCapacity(), | |
154 | source.getBuffer(), source.length(), | |
155 | mode, options, | |
156 | &status); | |
157 | dest->releaseBuffer(U_SUCCESS(status) ? length : 0); | |
158 | } | |
159 | ||
160 | if(dest==&localDest) { | |
161 | result=*dest; | |
162 | } | |
163 | if(U_FAILURE(status)) { | |
164 | result.setToBogus(); | |
165 | } | |
166 | } | |
167 | } | |
168 | ||
169 | void U_EXPORT2 | |
170 | Normalizer::compose(const UnicodeString& source, | |
171 | UBool compat, int32_t options, | |
172 | UnicodeString& result, | |
173 | UErrorCode &status) { | |
174 | if(source.isBogus() || U_FAILURE(status)) { | |
175 | result.setToBogus(); | |
176 | if(U_SUCCESS(status)) { | |
177 | status=U_ILLEGAL_ARGUMENT_ERROR; | |
178 | } | |
179 | } else { | |
180 | UnicodeString localDest; | |
181 | UnicodeString *dest; | |
182 | ||
183 | if(&source!=&result) { | |
184 | dest=&result; | |
185 | } else { | |
186 | // the source and result strings are the same object, use a temporary one | |
187 | dest=&localDest; | |
188 | } | |
189 | ||
190 | UChar *buffer=dest->getBuffer(source.length()); | |
191 | int32_t length=unorm_compose(buffer, dest->getCapacity(), | |
192 | source.getBuffer(), source.length(), | |
193 | compat, options, | |
194 | &status); | |
195 | dest->releaseBuffer(U_SUCCESS(status) ? length : 0); | |
196 | if(status==U_BUFFER_OVERFLOW_ERROR) { | |
197 | status=U_ZERO_ERROR; | |
198 | buffer=dest->getBuffer(length); | |
199 | length=unorm_compose(buffer, dest->getCapacity(), | |
200 | source.getBuffer(), source.length(), | |
201 | compat, options, | |
202 | &status); | |
203 | dest->releaseBuffer(U_SUCCESS(status) ? length : 0); | |
204 | } | |
205 | ||
206 | if(dest==&localDest) { | |
207 | result=*dest; | |
208 | } | |
209 | if(U_FAILURE(status)) { | |
210 | result.setToBogus(); | |
211 | } | |
212 | } | |
213 | } | |
214 | ||
215 | void U_EXPORT2 | |
216 | Normalizer::decompose(const UnicodeString& source, | |
217 | UBool compat, int32_t options, | |
218 | UnicodeString& result, | |
219 | UErrorCode &status) { | |
220 | if(source.isBogus() || U_FAILURE(status)) { | |
221 | result.setToBogus(); | |
222 | if(U_SUCCESS(status)) { | |
223 | status=U_ILLEGAL_ARGUMENT_ERROR; | |
224 | } | |
225 | } else { | |
226 | UnicodeString localDest; | |
227 | UnicodeString *dest; | |
228 | ||
229 | if(&source!=&result) { | |
230 | dest=&result; | |
231 | } else { | |
232 | // the source and result strings are the same object, use a temporary one | |
233 | dest=&localDest; | |
234 | } | |
235 | ||
236 | UChar *buffer=dest->getBuffer(source.length()); | |
237 | int32_t length=unorm_decompose(buffer, dest->getCapacity(), | |
238 | source.getBuffer(), source.length(), | |
239 | compat, options, | |
240 | &status); | |
241 | dest->releaseBuffer(U_SUCCESS(status) ? length : 0); | |
242 | if(status==U_BUFFER_OVERFLOW_ERROR) { | |
243 | status=U_ZERO_ERROR; | |
244 | buffer=dest->getBuffer(length); | |
245 | length=unorm_decompose(buffer, dest->getCapacity(), | |
246 | source.getBuffer(), source.length(), | |
247 | compat, options, | |
248 | &status); | |
249 | dest->releaseBuffer(U_SUCCESS(status) ? length : 0); | |
250 | } | |
251 | ||
252 | if(dest==&localDest) { | |
253 | result=*dest; | |
254 | } | |
255 | if(U_FAILURE(status)) { | |
256 | result.setToBogus(); | |
257 | } | |
258 | } | |
259 | } | |
260 | ||
261 | UnicodeString & U_EXPORT2 | |
262 | Normalizer::concatenate(UnicodeString &left, UnicodeString &right, | |
263 | UnicodeString &result, | |
264 | UNormalizationMode mode, int32_t options, | |
265 | UErrorCode &errorCode) { | |
266 | if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { | |
267 | result.setToBogus(); | |
268 | if(U_SUCCESS(errorCode)) { | |
269 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
270 | } | |
271 | } else { | |
272 | UnicodeString localDest; | |
273 | UnicodeString *dest; | |
274 | ||
275 | if(&left!=&result && &right!=&result) { | |
276 | dest=&result; | |
277 | } else { | |
278 | // the source and result strings are the same object, use a temporary one | |
279 | dest=&localDest; | |
280 | } | |
281 | ||
282 | UChar *buffer=dest->getBuffer(left.length()+right.length()); | |
283 | int32_t length=unorm_concatenate(left.getBuffer(), left.length(), | |
284 | right.getBuffer(), right.length(), | |
285 | buffer, dest->getCapacity(), | |
286 | mode, options, | |
287 | &errorCode); | |
288 | dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0); | |
289 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { | |
290 | errorCode=U_ZERO_ERROR; | |
291 | buffer=dest->getBuffer(length); | |
292 | int32_t length=unorm_concatenate(left.getBuffer(), left.length(), | |
293 | right.getBuffer(), right.length(), | |
294 | buffer, dest->getCapacity(), | |
295 | mode, options, | |
296 | &errorCode); | |
297 | dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0); | |
298 | } | |
299 | ||
300 | if(dest==&localDest) { | |
301 | result=*dest; | |
302 | } | |
303 | if(U_FAILURE(errorCode)) { | |
304 | result.setToBogus(); | |
305 | } | |
306 | } | |
307 | return result; | |
308 | } | |
309 | ||
310 | //------------------------------------------------------------------------- | |
311 | // Iteration API | |
312 | //------------------------------------------------------------------------- | |
313 | ||
314 | /** | |
315 | * Return the current character in the normalized text. | |
316 | */ | |
317 | UChar32 Normalizer::current() { | |
318 | if(bufferPos<buffer.length() || nextNormalize()) { | |
319 | return buffer.char32At(bufferPos); | |
320 | } else { | |
321 | return DONE; | |
322 | } | |
323 | } | |
324 | ||
325 | /** | |
326 | * Return the next character in the normalized text and advance | |
327 | * the iteration position by one. If the end | |
328 | * of the text has already been reached, {@link #DONE} is returned. | |
329 | */ | |
330 | UChar32 Normalizer::next() { | |
331 | if(bufferPos<buffer.length() || nextNormalize()) { | |
332 | UChar32 c=buffer.char32At(bufferPos); | |
333 | bufferPos+=UTF_CHAR_LENGTH(c); | |
334 | return c; | |
335 | } else { | |
336 | return DONE; | |
337 | } | |
338 | } | |
339 | ||
340 | /** | |
341 | * Return the previous character in the normalized text and decrement | |
342 | * the iteration position by one. If the beginning | |
343 | * of the text has already been reached, {@link #DONE} is returned. | |
344 | */ | |
345 | UChar32 Normalizer::previous() { | |
346 | if(bufferPos>0 || previousNormalize()) { | |
347 | UChar32 c=buffer.char32At(bufferPos-1); | |
348 | bufferPos-=UTF_CHAR_LENGTH(c); | |
349 | return c; | |
350 | } else { | |
351 | return DONE; | |
352 | } | |
353 | } | |
354 | ||
355 | void Normalizer::reset() { | |
356 | currentIndex=nextIndex=text->move(text, 0, UITER_START); | |
357 | clearBuffer(); | |
358 | } | |
359 | ||
360 | void | |
361 | Normalizer::setIndexOnly(int32_t index) { | |
362 | currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index | |
363 | clearBuffer(); | |
364 | } | |
365 | ||
366 | /** | |
367 | * Return the first character in the normalized text-> This resets | |
368 | * the <tt>Normalizer's</tt> position to the beginning of the text-> | |
369 | */ | |
370 | UChar32 Normalizer::first() { | |
371 | reset(); | |
372 | return next(); | |
373 | } | |
374 | ||
375 | /** | |
376 | * Return the last character in the normalized text-> This resets | |
377 | * the <tt>Normalizer's</tt> position to be just before the | |
378 | * the input text corresponding to that normalized character. | |
379 | */ | |
380 | UChar32 Normalizer::last() { | |
381 | currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT); | |
382 | clearBuffer(); | |
383 | return previous(); | |
384 | } | |
385 | ||
386 | /** | |
387 | * Retrieve the current iteration position in the input text that is | |
388 | * being normalized. This method is useful in applications such as | |
389 | * searching, where you need to be able to determine the position in | |
390 | * the input text that corresponds to a given normalized output character. | |
391 | * <p> | |
392 | * <b>Note:</b> This method sets the position in the <em>input</em>, while | |
393 | * {@link #next} and {@link #previous} iterate through characters in the | |
394 | * <em>output</em>. This means that there is not necessarily a one-to-one | |
395 | * correspondence between characters returned by <tt>next</tt> and | |
396 | * <tt>previous</tt> and the indices passed to and returned from | |
397 | * <tt>setIndex</tt> and {@link #getIndex}. | |
398 | * | |
399 | */ | |
400 | int32_t Normalizer::getIndex() const { | |
401 | if(bufferPos<buffer.length()) { | |
402 | return currentIndex; | |
403 | } else { | |
404 | return nextIndex; | |
405 | } | |
406 | } | |
407 | ||
408 | /** | |
409 | * Retrieve the index of the start of the input text-> This is the begin index | |
410 | * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> | |
411 | * over which this <tt>Normalizer</tt> is iterating | |
412 | */ | |
413 | int32_t Normalizer::startIndex() const { | |
414 | return text->getIndex(text, UITER_START); | |
415 | } | |
416 | ||
417 | /** | |
418 | * Retrieve the index of the end of the input text-> This is the end index | |
419 | * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> | |
420 | * over which this <tt>Normalizer</tt> is iterating | |
421 | */ | |
422 | int32_t Normalizer::endIndex() const { | |
423 | return text->getIndex(text, UITER_LIMIT); | |
424 | } | |
425 | ||
426 | //------------------------------------------------------------------------- | |
427 | // Property access methods | |
428 | //------------------------------------------------------------------------- | |
429 | ||
430 | void | |
431 | Normalizer::setMode(UNormalizationMode newMode) | |
432 | { | |
433 | fUMode = newMode; | |
434 | } | |
435 | ||
436 | UNormalizationMode | |
437 | Normalizer::getUMode() const | |
438 | { | |
439 | return fUMode; | |
440 | } | |
441 | ||
442 | void | |
443 | Normalizer::setOption(int32_t option, | |
444 | UBool value) | |
445 | { | |
446 | if (value) { | |
447 | fOptions |= option; | |
448 | } else { | |
449 | fOptions &= (~option); | |
450 | } | |
451 | } | |
452 | ||
453 | UBool | |
454 | Normalizer::getOption(int32_t option) const | |
455 | { | |
456 | return (fOptions & option) != 0; | |
457 | } | |
458 | ||
459 | /** | |
460 | * Set the input text over which this <tt>Normalizer</tt> will iterate. | |
461 | * The iteration position is set to the beginning of the input text-> | |
462 | */ | |
463 | void | |
464 | Normalizer::setText(const UnicodeString& newText, | |
465 | UErrorCode &status) | |
466 | { | |
467 | if (U_FAILURE(status)) { | |
468 | return; | |
469 | } | |
470 | CharacterIterator *newIter = new StringCharacterIterator(newText); | |
471 | if (newIter == NULL) { | |
472 | status = U_MEMORY_ALLOCATION_ERROR; | |
473 | return; | |
474 | } | |
475 | delete (CharacterIterator *)(text->context); | |
476 | text->context = newIter; | |
477 | reset(); | |
478 | } | |
479 | ||
480 | /** | |
481 | * Set the input text over which this <tt>Normalizer</tt> will iterate. | |
482 | * The iteration position is set to the beginning of the string. | |
483 | */ | |
484 | void | |
485 | Normalizer::setText(const CharacterIterator& newText, | |
486 | UErrorCode &status) | |
487 | { | |
488 | if (U_FAILURE(status)) { | |
489 | return; | |
490 | } | |
491 | CharacterIterator *newIter = newText.clone(); | |
492 | if (newIter == NULL) { | |
493 | status = U_MEMORY_ALLOCATION_ERROR; | |
494 | return; | |
495 | } | |
496 | delete (CharacterIterator *)(text->context); | |
497 | text->context = newIter; | |
498 | reset(); | |
499 | } | |
500 | ||
501 | void | |
502 | Normalizer::setText(const UChar* newText, | |
503 | int32_t length, | |
504 | UErrorCode &status) | |
505 | { | |
506 | if (U_FAILURE(status)) { | |
507 | return; | |
508 | } | |
509 | CharacterIterator *newIter = new UCharCharacterIterator(newText, length); | |
510 | if (newIter == NULL) { | |
511 | status = U_MEMORY_ALLOCATION_ERROR; | |
512 | return; | |
513 | } | |
514 | delete (CharacterIterator *)(text->context); | |
515 | text->context = newIter; | |
516 | reset(); | |
517 | } | |
518 | ||
519 | /** | |
520 | * Copies the text under iteration into the UnicodeString referred to by "result". | |
521 | * @param result Receives a copy of the text under iteration. | |
522 | */ | |
523 | void | |
524 | Normalizer::getText(UnicodeString& result) | |
525 | { | |
526 | ((CharacterIterator *)(text->context))->getText(result); | |
527 | } | |
528 | ||
529 | //------------------------------------------------------------------------- | |
530 | // Private utility methods | |
531 | //------------------------------------------------------------------------- | |
532 | ||
533 | void Normalizer::clearBuffer() { | |
534 | buffer.remove(); | |
535 | bufferPos=0; | |
536 | } | |
537 | ||
538 | UBool | |
539 | Normalizer::nextNormalize() { | |
540 | UChar *p; | |
541 | int32_t length; | |
542 | UErrorCode errorCode; | |
543 | ||
544 | clearBuffer(); | |
545 | currentIndex=nextIndex; | |
546 | text->move(text, nextIndex, UITER_ZERO); | |
547 | if(!text->hasNext(text)) { | |
548 | return FALSE; | |
549 | } | |
550 | ||
551 | errorCode=U_ZERO_ERROR; | |
552 | p=buffer.getBuffer(-1); | |
553 | length=unorm_next(text, p, buffer.getCapacity(), | |
554 | fUMode, fOptions, | |
555 | TRUE, 0, | |
556 | &errorCode); | |
557 | buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); | |
558 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { | |
559 | errorCode=U_ZERO_ERROR; | |
560 | text->move(text, nextIndex, UITER_ZERO); | |
561 | p=buffer.getBuffer(length); | |
562 | length=unorm_next(text, p, buffer.getCapacity(), | |
563 | fUMode, fOptions, | |
564 | TRUE, 0, | |
565 | &errorCode); | |
566 | buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); | |
567 | } | |
568 | ||
569 | nextIndex=text->getIndex(text, UITER_CURRENT); | |
570 | return U_SUCCESS(errorCode) && !buffer.isEmpty(); | |
571 | } | |
572 | ||
573 | UBool | |
574 | Normalizer::previousNormalize() { | |
575 | UChar *p; | |
576 | int32_t length; | |
577 | UErrorCode errorCode; | |
578 | ||
579 | clearBuffer(); | |
580 | nextIndex=currentIndex; | |
581 | text->move(text, currentIndex, UITER_ZERO); | |
582 | if(!text->hasPrevious(text)) { | |
583 | return FALSE; | |
584 | } | |
585 | ||
586 | errorCode=U_ZERO_ERROR; | |
587 | p=buffer.getBuffer(-1); | |
588 | length=unorm_previous(text, p, buffer.getCapacity(), | |
589 | fUMode, fOptions, | |
590 | TRUE, 0, | |
591 | &errorCode); | |
592 | buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); | |
593 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { | |
594 | errorCode=U_ZERO_ERROR; | |
595 | text->move(text, currentIndex, UITER_ZERO); | |
596 | p=buffer.getBuffer(length); | |
597 | length=unorm_previous(text, p, buffer.getCapacity(), | |
598 | fUMode, fOptions, | |
599 | TRUE, 0, | |
600 | &errorCode); | |
601 | buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); | |
602 | } | |
603 | ||
604 | bufferPos=buffer.length(); | |
605 | currentIndex=text->getIndex(text, UITER_CURRENT); | |
606 | return U_SUCCESS(errorCode) && !buffer.isEmpty(); | |
607 | } | |
608 | ||
609 | U_NAMESPACE_END | |
610 | ||
611 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |