2 *************************************************************************
4 * Copyright (c) 1996-2010, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 *************************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_NORMALIZATION
13 #include "unicode/uniset.h"
14 #include "unicode/unistr.h"
15 #include "unicode/chariter.h"
16 #include "unicode/schriter.h"
17 #include "unicode/uchriter.h"
18 #include "unicode/normlzr.h"
20 #include "normalizer2impl.h"
21 #include "uprops.h" // for uniset_getUnicode32Instance()
25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer
)
//-------------------------------------------------------------------------
// Constructors and other boilerplate
//-------------------------------------------------------------------------
31 Normalizer::Normalizer(const UnicodeString
& str
, UNormalizationMode mode
) :
32 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
33 text(new StringCharacterIterator(str
)),
34 currentIndex(0), nextIndex(0),
35 buffer(), bufferPos(0)
40 Normalizer::Normalizer(const UChar
*str
, int32_t length
, UNormalizationMode mode
) :
41 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
42 text(new UCharCharacterIterator(str
, length
)),
43 currentIndex(0), nextIndex(0),
44 buffer(), bufferPos(0)
49 Normalizer::Normalizer(const CharacterIterator
& iter
, UNormalizationMode mode
) :
50 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
52 currentIndex(0), nextIndex(0),
53 buffer(), bufferPos(0)
58 Normalizer::Normalizer(const Normalizer
©
) :
59 UObject(copy
), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(copy
.fUMode
), fOptions(copy
.fOptions
),
60 text(copy
.text
->clone()),
61 currentIndex(copy
.currentIndex
), nextIndex(copy
.nextIndex
),
62 buffer(copy
.buffer
), bufferPos(copy
.bufferPos
)
67 static const UChar _NUL
=0;
71 UErrorCode errorCode
=U_ZERO_ERROR
;
72 fNorm2
=Normalizer2Factory::getInstance(fUMode
, errorCode
);
73 if(fOptions
&UNORM_UNICODE_3_2
) {
74 delete fFilteredNorm2
;
75 fNorm2
=fFilteredNorm2
=
76 new FilteredNormalizer2(*fNorm2
, *uniset_getUnicode32Instance(errorCode
));
78 if(U_FAILURE(errorCode
)) {
79 errorCode
=U_ZERO_ERROR
;
80 fNorm2
=Normalizer2Factory::getNoopInstance(errorCode
);
84 Normalizer::~Normalizer()
86 delete fFilteredNorm2
;
91 Normalizer::clone() const
93 return new Normalizer(*this);
97 * Generates a hash code for this iterator.
99 int32_t Normalizer::hashCode() const
101 return text
->hashCode() + fUMode
+ fOptions
+ buffer
.hashCode() + bufferPos
+ currentIndex
+ nextIndex
;
104 UBool
Normalizer::operator==(const Normalizer
& that
) const
108 (fUMode
==that
.fUMode
&&
109 fOptions
==that
.fOptions
&&
111 buffer
==that
.buffer
&&
112 bufferPos
==that
.bufferPos
&&
113 nextIndex
==that
.nextIndex
);
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
121 Normalizer::normalize(const UnicodeString
& source
,
122 UNormalizationMode mode
, int32_t options
,
123 UnicodeString
& result
,
124 UErrorCode
&status
) {
125 if(source
.isBogus() || U_FAILURE(status
)) {
127 if(U_SUCCESS(status
)) {
128 status
=U_ILLEGAL_ARGUMENT_ERROR
;
131 UnicodeString localDest
;
134 if(&source
!=&result
) {
137 // the source and result strings are the same object, use a temporary one
140 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
141 if(U_SUCCESS(status
)) {
142 if(options
&UNORM_UNICODE_3_2
) {
143 FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
144 normalize(source
, *dest
, status
);
146 n2
->normalize(source
, *dest
, status
);
149 if(dest
==&localDest
&& U_SUCCESS(status
)) {
156 Normalizer::compose(const UnicodeString
& source
,
157 UBool compat
, int32_t options
,
158 UnicodeString
& result
,
159 UErrorCode
&status
) {
160 normalize(source
, compat
? UNORM_NFKC
: UNORM_NFC
, options
, result
, status
);
164 Normalizer::decompose(const UnicodeString
& source
,
165 UBool compat
, int32_t options
,
166 UnicodeString
& result
,
167 UErrorCode
&status
) {
168 normalize(source
, compat
? UNORM_NFKD
: UNORM_NFD
, options
, result
, status
);
171 UNormalizationCheckResult
172 Normalizer::quickCheck(const UnicodeString
& source
,
173 UNormalizationMode mode
, int32_t options
,
174 UErrorCode
&status
) {
175 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
176 if(U_SUCCESS(status
)) {
177 if(options
&UNORM_UNICODE_3_2
) {
178 return FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
179 quickCheck(source
, status
);
181 return n2
->quickCheck(source
, status
);
189 Normalizer::isNormalized(const UnicodeString
& source
,
190 UNormalizationMode mode
, int32_t options
,
191 UErrorCode
&status
) {
192 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
193 if(U_SUCCESS(status
)) {
194 if(options
&UNORM_UNICODE_3_2
) {
195 return FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
196 isNormalized(source
, status
);
198 return n2
->isNormalized(source
, status
);
205 UnicodeString
& U_EXPORT2
206 Normalizer::concatenate(UnicodeString
&left
, UnicodeString
&right
,
207 UnicodeString
&result
,
208 UNormalizationMode mode
, int32_t options
,
209 UErrorCode
&errorCode
) {
210 if(left
.isBogus() || right
.isBogus() || U_FAILURE(errorCode
)) {
212 if(U_SUCCESS(errorCode
)) {
213 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
216 UnicodeString localDest
;
219 if(&right
!=&result
) {
222 // the right and result strings are the same object, use a temporary one
226 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, errorCode
);
227 if(U_SUCCESS(errorCode
)) {
228 if(options
&UNORM_UNICODE_3_2
) {
229 FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(errorCode
)).
230 append(*dest
, right
, errorCode
);
232 n2
->append(*dest
, right
, errorCode
);
235 if(dest
==&localDest
&& U_SUCCESS(errorCode
)) {
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
247 * Return the current character in the normalized text.
249 UChar32
Normalizer::current() {
250 if(bufferPos
<buffer
.length() || nextNormalize()) {
251 return buffer
.char32At(bufferPos
);
258 * Return the next character in the normalized text and advance
259 * the iteration position by one. If the end
260 * of the text has already been reached, {@link #DONE} is returned.
262 UChar32
Normalizer::next() {
263 if(bufferPos
<buffer
.length() || nextNormalize()) {
264 UChar32 c
=buffer
.char32At(bufferPos
);
265 bufferPos
+=UTF_CHAR_LENGTH(c
);
273 * Return the previous character in the normalized text and decrement
274 * the iteration position by one. If the beginning
275 * of the text has already been reached, {@link #DONE} is returned.
277 UChar32
Normalizer::previous() {
278 if(bufferPos
>0 || previousNormalize()) {
279 UChar32 c
=buffer
.char32At(bufferPos
-1);
280 bufferPos
-=UTF_CHAR_LENGTH(c
);
287 void Normalizer::reset() {
288 currentIndex
=nextIndex
=text
->setToStart();
293 Normalizer::setIndexOnly(int32_t index
) {
294 text
->setIndex(index
); // pins index
295 currentIndex
=nextIndex
=text
->getIndex();
300 * Return the first character in the normalized text. This resets
301 * the <tt>Normalizer's</tt> position to the beginning of the text.
303 UChar32
Normalizer::first() {
309 * Return the last character in the normalized text. This resets
310 * the <tt>Normalizer's</tt> position to be just before the
311 * the input text corresponding to that normalized character.
313 UChar32
Normalizer::last() {
314 currentIndex
=nextIndex
=text
->setToEnd();
320 * Retrieve the current iteration position in the input text that is
321 * being normalized. This method is useful in applications such as
322 * searching, where you need to be able to determine the position in
323 * the input text that corresponds to a given normalized output character.
325 * <b>Note:</b> This method sets the position in the <em>input</em>, while
326 * {@link #next} and {@link #previous} iterate through characters in the
327 * <em>output</em>. This means that there is not necessarily a one-to-one
328 * correspondence between characters returned by <tt>next</tt> and
329 * <tt>previous</tt> and the indices passed to and returned from
330 * <tt>setIndex</tt> and {@link #getIndex}.
333 int32_t Normalizer::getIndex() const {
334 if(bufferPos
<buffer
.length()) {
342 * Retrieve the index of the start of the input text. This is the begin index
343 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
344 * over which this <tt>Normalizer</tt> is iterating
346 int32_t Normalizer::startIndex() const {
347 return text
->startIndex();
351 * Retrieve the index of the end of the input text. This is the end index
352 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
353 * over which this <tt>Normalizer</tt> is iterating
355 int32_t Normalizer::endIndex() const {
356 return text
->endIndex();
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
364 Normalizer::setMode(UNormalizationMode newMode
)
371 Normalizer::getUMode() const
377 Normalizer::setOption(int32_t option
,
383 fOptions
&= (~option
);
389 Normalizer::getOption(int32_t option
) const
391 return (fOptions
& option
) != 0;
395 * Set the input text over which this <tt>Normalizer</tt> will iterate.
396 * The iteration position is set to the beginning of the input text.
399 Normalizer::setText(const UnicodeString
& newText
,
402 if (U_FAILURE(status
)) {
405 CharacterIterator
*newIter
= new StringCharacterIterator(newText
);
406 if (newIter
== NULL
) {
407 status
= U_MEMORY_ALLOCATION_ERROR
;
416 * Set the input text over which this <tt>Normalizer</tt> will iterate.
417 * The iteration position is set to the beginning of the string.
420 Normalizer::setText(const CharacterIterator
& newText
,
423 if (U_FAILURE(status
)) {
426 CharacterIterator
*newIter
= newText
.clone();
427 if (newIter
== NULL
) {
428 status
= U_MEMORY_ALLOCATION_ERROR
;
437 Normalizer::setText(const UChar
* newText
,
441 if (U_FAILURE(status
)) {
444 CharacterIterator
*newIter
= new UCharCharacterIterator(newText
, length
);
445 if (newIter
== NULL
) {
446 status
= U_MEMORY_ALLOCATION_ERROR
;
455 * Copies the text under iteration into the UnicodeString referred to by "result".
456 * @param result Receives a copy of the text under iteration.
459 Normalizer::getText(UnicodeString
& result
)
461 text
->getText(result
);
//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------
468 void Normalizer::clearBuffer() {
474 Normalizer::nextNormalize() {
476 currentIndex
=nextIndex
;
477 text
->setIndex(nextIndex
);
478 if(!text
->hasNext()) {
481 // Skip at least one character so we make progress.
482 UnicodeString
segment(text
->next32PostInc());
483 while(text
->hasNext()) {
485 if(fNorm2
->hasBoundaryBefore(c
=text
->next32PostInc())) {
486 text
->move32(-1, CharacterIterator::kCurrent
);
491 nextIndex
=text
->getIndex();
492 UErrorCode errorCode
=U_ZERO_ERROR
;
493 fNorm2
->normalize(segment
, buffer
, errorCode
);
494 return U_SUCCESS(errorCode
) && !buffer
.isEmpty();
498 Normalizer::previousNormalize() {
500 nextIndex
=currentIndex
;
501 text
->setIndex(currentIndex
);
502 if(!text
->hasPrevious()) {
505 UnicodeString segment
;
506 while(text
->hasPrevious()) {
507 UChar32 c
=text
->previous32();
508 segment
.insert(0, c
);
509 if(fNorm2
->hasBoundaryBefore(c
)) {
513 currentIndex
=text
->getIndex();
514 UErrorCode errorCode
=U_ZERO_ERROR
;
515 fNorm2
->normalize(segment
, buffer
, errorCode
);
516 bufferPos
=buffer
.length();
517 return U_SUCCESS(errorCode
) && !buffer
.isEmpty();
522 #endif /* #if !UCONFIG_NO_NORMALIZATION */