/*
 *************************************************************************
 * Copyright (c) 1996-2005, International Business Machines Corporation and
 * others. All Rights Reserved.
 *************************************************************************
 */
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_NORMALIZATION
13 #include "unicode/unistr.h"
14 #include "unicode/chariter.h"
15 #include "unicode/schriter.h"
16 #include "unicode/uchriter.h"
17 #include "unicode/uiter.h"
18 #include "unicode/normlzr.h"
24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer
)
//-------------------------------------------------------------------------
// Constructors and other boilerplate
//-------------------------------------------------------------------------
30 Normalizer::Normalizer(const UnicodeString
& str
, UNormalizationMode mode
) :
31 UObject(), fUMode(mode
), fOptions(0),
32 currentIndex(0), nextIndex(0),
33 buffer(), bufferPos(0)
35 init(new StringCharacterIterator(str
));
38 Normalizer::Normalizer(const UChar
*str
, int32_t length
, UNormalizationMode mode
) :
39 UObject(), fUMode(mode
), fOptions(0),
40 currentIndex(0), nextIndex(0),
41 buffer(), bufferPos(0)
43 init(new UCharCharacterIterator(str
, length
));
46 Normalizer::Normalizer(const CharacterIterator
& iter
, UNormalizationMode mode
) :
47 UObject(), fUMode(mode
), fOptions(0),
48 currentIndex(0), nextIndex(0),
49 buffer(), bufferPos(0)
54 Normalizer::Normalizer(const Normalizer
©
) :
55 UObject(copy
), fUMode(copy
.fUMode
), fOptions(copy
.fOptions
),
56 currentIndex(copy
.currentIndex
), nextIndex(copy
.nextIndex
),
57 buffer(copy
.buffer
), bufferPos(copy
.bufferPos
)
59 init(((CharacterIterator
*)(copy
.text
->context
))->clone());
62 static const UChar _NUL
=0;
65 Normalizer::init(CharacterIterator
*iter
) {
66 UErrorCode errorCode
=U_ZERO_ERROR
;
68 text
=(UCharIterator
*)uprv_malloc(sizeof(UCharIterator
));
70 if(unorm_haveData(&errorCode
)) {
71 uiter_setCharacterIterator(text
, iter
);
74 uiter_setCharacterIterator(text
, new UCharCharacterIterator(&_NUL
, 0));
81 Normalizer::~Normalizer()
84 delete (CharacterIterator
*)text
->context
;
90 Normalizer::clone() const
93 return new Normalizer(*this);
100 * Generates a hash code for this iterator.
102 int32_t Normalizer::hashCode() const
104 return ((CharacterIterator
*)(text
->context
))->hashCode() + fUMode
+ fOptions
+ buffer
.hashCode() + bufferPos
+ currentIndex
+ nextIndex
;
107 UBool
Normalizer::operator==(const Normalizer
& that
) const
111 fUMode
==that
.fUMode
&&
112 fOptions
==that
.fOptions
&&
113 *((CharacterIterator
*)(text
->context
))==*((CharacterIterator
*)(that
.text
->context
)) &&
114 buffer
==that
.buffer
&&
115 bufferPos
==that
.bufferPos
&&
116 nextIndex
==that
.nextIndex
;
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
124 Normalizer::normalize(const UnicodeString
& source
,
125 UNormalizationMode mode
, int32_t options
,
126 UnicodeString
& result
,
127 UErrorCode
&status
) {
128 if(source
.isBogus() || U_FAILURE(status
)) {
130 if(U_SUCCESS(status
)) {
131 status
=U_ILLEGAL_ARGUMENT_ERROR
;
134 UnicodeString localDest
;
137 if(&source
!=&result
) {
140 // the source and result strings are the same object, use a temporary one
144 UChar
*buffer
=dest
->getBuffer(source
.length());
145 int32_t length
=unorm_internalNormalize(buffer
, dest
->getCapacity(),
146 source
.getBuffer(), source
.length(),
149 dest
->releaseBuffer(U_SUCCESS(status
) ? length
: 0);
150 if(status
==U_BUFFER_OVERFLOW_ERROR
) {
152 buffer
=dest
->getBuffer(length
);
153 length
=unorm_internalNormalize(buffer
, dest
->getCapacity(),
154 source
.getBuffer(), source
.length(),
157 dest
->releaseBuffer(U_SUCCESS(status
) ? length
: 0);
160 if(dest
==&localDest
) {
163 if(U_FAILURE(status
)) {
170 Normalizer::compose(const UnicodeString
& source
,
171 UBool compat
, int32_t options
,
172 UnicodeString
& result
,
173 UErrorCode
&status
) {
174 if(source
.isBogus() || U_FAILURE(status
)) {
176 if(U_SUCCESS(status
)) {
177 status
=U_ILLEGAL_ARGUMENT_ERROR
;
180 UnicodeString localDest
;
183 if(&source
!=&result
) {
186 // the source and result strings are the same object, use a temporary one
190 UChar
*buffer
=dest
->getBuffer(source
.length());
191 int32_t length
=unorm_compose(buffer
, dest
->getCapacity(),
192 source
.getBuffer(), source
.length(),
195 dest
->releaseBuffer(U_SUCCESS(status
) ? length
: 0);
196 if(status
==U_BUFFER_OVERFLOW_ERROR
) {
198 buffer
=dest
->getBuffer(length
);
199 length
=unorm_compose(buffer
, dest
->getCapacity(),
200 source
.getBuffer(), source
.length(),
203 dest
->releaseBuffer(U_SUCCESS(status
) ? length
: 0);
206 if(dest
==&localDest
) {
209 if(U_FAILURE(status
)) {
216 Normalizer::decompose(const UnicodeString
& source
,
217 UBool compat
, int32_t options
,
218 UnicodeString
& result
,
219 UErrorCode
&status
) {
220 if(source
.isBogus() || U_FAILURE(status
)) {
222 if(U_SUCCESS(status
)) {
223 status
=U_ILLEGAL_ARGUMENT_ERROR
;
226 UnicodeString localDest
;
229 if(&source
!=&result
) {
232 // the source and result strings are the same object, use a temporary one
236 UChar
*buffer
=dest
->getBuffer(source
.length());
237 int32_t length
=unorm_decompose(buffer
, dest
->getCapacity(),
238 source
.getBuffer(), source
.length(),
241 dest
->releaseBuffer(U_SUCCESS(status
) ? length
: 0);
242 if(status
==U_BUFFER_OVERFLOW_ERROR
) {
244 buffer
=dest
->getBuffer(length
);
245 length
=unorm_decompose(buffer
, dest
->getCapacity(),
246 source
.getBuffer(), source
.length(),
249 dest
->releaseBuffer(U_SUCCESS(status
) ? length
: 0);
252 if(dest
==&localDest
) {
255 if(U_FAILURE(status
)) {
261 UnicodeString
& U_EXPORT2
262 Normalizer::concatenate(UnicodeString
&left
, UnicodeString
&right
,
263 UnicodeString
&result
,
264 UNormalizationMode mode
, int32_t options
,
265 UErrorCode
&errorCode
) {
266 if(left
.isBogus() || right
.isBogus() || U_FAILURE(errorCode
)) {
268 if(U_SUCCESS(errorCode
)) {
269 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
272 UnicodeString localDest
;
275 if(&left
!=&result
&& &right
!=&result
) {
278 // the source and result strings are the same object, use a temporary one
282 UChar
*buffer
=dest
->getBuffer(left
.length()+right
.length());
283 int32_t length
=unorm_concatenate(left
.getBuffer(), left
.length(),
284 right
.getBuffer(), right
.length(),
285 buffer
, dest
->getCapacity(),
288 dest
->releaseBuffer(U_SUCCESS(errorCode
) ? length
: 0);
289 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
290 errorCode
=U_ZERO_ERROR
;
291 buffer
=dest
->getBuffer(length
);
292 int32_t length
=unorm_concatenate(left
.getBuffer(), left
.length(),
293 right
.getBuffer(), right
.length(),
294 buffer
, dest
->getCapacity(),
297 dest
->releaseBuffer(U_SUCCESS(errorCode
) ? length
: 0);
300 if(dest
==&localDest
) {
303 if(U_FAILURE(errorCode
)) {
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
315 * Return the current character in the normalized text.
317 UChar32
Normalizer::current() {
318 if(bufferPos
<buffer
.length() || nextNormalize()) {
319 return buffer
.char32At(bufferPos
);
326 * Return the next character in the normalized text and advance
327 * the iteration position by one. If the end
328 * of the text has already been reached, {@link #DONE} is returned.
330 UChar32
Normalizer::next() {
331 if(bufferPos
<buffer
.length() || nextNormalize()) {
332 UChar32 c
=buffer
.char32At(bufferPos
);
333 bufferPos
+=UTF_CHAR_LENGTH(c
);
341 * Return the previous character in the normalized text and decrement
342 * the iteration position by one. If the beginning
343 * of the text has already been reached, {@link #DONE} is returned.
345 UChar32
Normalizer::previous() {
346 if(bufferPos
>0 || previousNormalize()) {
347 UChar32 c
=buffer
.char32At(bufferPos
-1);
348 bufferPos
-=UTF_CHAR_LENGTH(c
);
355 void Normalizer::reset() {
356 currentIndex
=nextIndex
=text
->move(text
, 0, UITER_START
);
361 Normalizer::setIndexOnly(int32_t index
) {
362 currentIndex
=nextIndex
=text
->move(text
, index
, UITER_ZERO
); // validates index
367 * Return the first character in the normalized text-> This resets
368 * the <tt>Normalizer's</tt> position to the beginning of the text->
370 UChar32
Normalizer::first() {
376 * Return the last character in the normalized text-> This resets
377 * the <tt>Normalizer's</tt> position to be just before the
378 * the input text corresponding to that normalized character.
380 UChar32
Normalizer::last() {
381 currentIndex
=nextIndex
=text
->move(text
, 0, UITER_LIMIT
);
387 * Retrieve the current iteration position in the input text that is
388 * being normalized. This method is useful in applications such as
389 * searching, where you need to be able to determine the position in
390 * the input text that corresponds to a given normalized output character.
392 * <b>Note:</b> This method sets the position in the <em>input</em>, while
393 * {@link #next} and {@link #previous} iterate through characters in the
394 * <em>output</em>. This means that there is not necessarily a one-to-one
395 * correspondence between characters returned by <tt>next</tt> and
396 * <tt>previous</tt> and the indices passed to and returned from
397 * <tt>setIndex</tt> and {@link #getIndex}.
400 int32_t Normalizer::getIndex() const {
401 if(bufferPos
<buffer
.length()) {
409 * Retrieve the index of the start of the input text-> This is the begin index
410 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
411 * over which this <tt>Normalizer</tt> is iterating
413 int32_t Normalizer::startIndex() const {
414 return text
->getIndex(text
, UITER_START
);
418 * Retrieve the index of the end of the input text-> This is the end index
419 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
420 * over which this <tt>Normalizer</tt> is iterating
422 int32_t Normalizer::endIndex() const {
423 return text
->getIndex(text
, UITER_LIMIT
);
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
431 Normalizer::setMode(UNormalizationMode newMode
)
437 Normalizer::getUMode() const
443 Normalizer::setOption(int32_t option
,
449 fOptions
&= (~option
);
454 Normalizer::getOption(int32_t option
) const
456 return (fOptions
& option
) != 0;
460 * Set the input text over which this <tt>Normalizer</tt> will iterate.
461 * The iteration position is set to the beginning of the input text->
464 Normalizer::setText(const UnicodeString
& newText
,
467 if (U_FAILURE(status
)) {
470 CharacterIterator
*newIter
= new StringCharacterIterator(newText
);
471 if (newIter
== NULL
) {
472 status
= U_MEMORY_ALLOCATION_ERROR
;
475 delete (CharacterIterator
*)(text
->context
);
476 text
->context
= newIter
;
481 * Set the input text over which this <tt>Normalizer</tt> will iterate.
482 * The iteration position is set to the beginning of the string.
485 Normalizer::setText(const CharacterIterator
& newText
,
488 if (U_FAILURE(status
)) {
491 CharacterIterator
*newIter
= newText
.clone();
492 if (newIter
== NULL
) {
493 status
= U_MEMORY_ALLOCATION_ERROR
;
496 delete (CharacterIterator
*)(text
->context
);
497 text
->context
= newIter
;
502 Normalizer::setText(const UChar
* newText
,
506 if (U_FAILURE(status
)) {
509 CharacterIterator
*newIter
= new UCharCharacterIterator(newText
, length
);
510 if (newIter
== NULL
) {
511 status
= U_MEMORY_ALLOCATION_ERROR
;
514 delete (CharacterIterator
*)(text
->context
);
515 text
->context
= newIter
;
520 * Copies the text under iteration into the UnicodeString referred to by "result".
521 * @param result Receives a copy of the text under iteration.
524 Normalizer::getText(UnicodeString
& result
)
526 ((CharacterIterator
*)(text
->context
))->getText(result
);
//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------
533 void Normalizer::clearBuffer() {
539 Normalizer::nextNormalize() {
542 UErrorCode errorCode
;
545 currentIndex
=nextIndex
;
546 text
->move(text
, nextIndex
, UITER_ZERO
);
547 if(!text
->hasNext(text
)) {
551 errorCode
=U_ZERO_ERROR
;
552 p
=buffer
.getBuffer(-1);
553 length
=unorm_next(text
, p
, buffer
.getCapacity(),
557 buffer
.releaseBuffer(U_SUCCESS(errorCode
) ? length
: 0);
558 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
559 errorCode
=U_ZERO_ERROR
;
560 text
->move(text
, nextIndex
, UITER_ZERO
);
561 p
=buffer
.getBuffer(length
);
562 length
=unorm_next(text
, p
, buffer
.getCapacity(),
566 buffer
.releaseBuffer(U_SUCCESS(errorCode
) ? length
: 0);
569 nextIndex
=text
->getIndex(text
, UITER_CURRENT
);
570 return U_SUCCESS(errorCode
) && !buffer
.isEmpty();
574 Normalizer::previousNormalize() {
577 UErrorCode errorCode
;
580 nextIndex
=currentIndex
;
581 text
->move(text
, currentIndex
, UITER_ZERO
);
582 if(!text
->hasPrevious(text
)) {
586 errorCode
=U_ZERO_ERROR
;
587 p
=buffer
.getBuffer(-1);
588 length
=unorm_previous(text
, p
, buffer
.getCapacity(),
592 buffer
.releaseBuffer(U_SUCCESS(errorCode
) ? length
: 0);
593 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
594 errorCode
=U_ZERO_ERROR
;
595 text
->move(text
, currentIndex
, UITER_ZERO
);
596 p
=buffer
.getBuffer(length
);
597 length
=unorm_previous(text
, p
, buffer
.getCapacity(),
601 buffer
.releaseBuffer(U_SUCCESS(errorCode
) ? length
: 0);
604 bufferPos
=buffer
.length();
605 currentIndex
=text
->getIndex(text
, UITER_CURRENT
);
606 return U_SUCCESS(errorCode
) && !buffer
.isEmpty();
611 #endif /* #if !UCONFIG_NO_NORMALIZATION */