/*
 *************************************************************************
 * Copyright (c) 1996-2011, International Business Machines Corporation and
 * others. All Rights Reserved.
 *************************************************************************
 */
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_NORMALIZATION
13 #include "unicode/uniset.h"
14 #include "unicode/unistr.h"
15 #include "unicode/chariter.h"
16 #include "unicode/schriter.h"
17 #include "unicode/uchriter.h"
18 #include "unicode/normlzr.h"
19 #include "unicode/utf16.h"
21 #include "normalizer2impl.h"
22 #include "uprops.h" // for uniset_getUnicode32Instance()
26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer
)
//-------------------------------------------------------------------------
// Constructors and other boilerplate
//-------------------------------------------------------------------------
32 Normalizer::Normalizer(const UnicodeString
& str
, UNormalizationMode mode
) :
33 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
34 text(new StringCharacterIterator(str
)),
35 currentIndex(0), nextIndex(0),
36 buffer(), bufferPos(0)
41 Normalizer::Normalizer(const UChar
*str
, int32_t length
, UNormalizationMode mode
) :
42 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
43 text(new UCharCharacterIterator(str
, length
)),
44 currentIndex(0), nextIndex(0),
45 buffer(), bufferPos(0)
50 Normalizer::Normalizer(const CharacterIterator
& iter
, UNormalizationMode mode
) :
51 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
53 currentIndex(0), nextIndex(0),
54 buffer(), bufferPos(0)
59 Normalizer::Normalizer(const Normalizer
©
) :
60 UObject(copy
), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(copy
.fUMode
), fOptions(copy
.fOptions
),
61 text(copy
.text
->clone()),
62 currentIndex(copy
.currentIndex
), nextIndex(copy
.nextIndex
),
63 buffer(copy
.buffer
), bufferPos(copy
.bufferPos
)
68 static const UChar _NUL
=0;
72 UErrorCode errorCode
=U_ZERO_ERROR
;
73 fNorm2
=Normalizer2Factory::getInstance(fUMode
, errorCode
);
74 if(fOptions
&UNORM_UNICODE_3_2
) {
75 delete fFilteredNorm2
;
76 fNorm2
=fFilteredNorm2
=
77 new FilteredNormalizer2(*fNorm2
, *uniset_getUnicode32Instance(errorCode
));
79 if(U_FAILURE(errorCode
)) {
80 errorCode
=U_ZERO_ERROR
;
81 fNorm2
=Normalizer2Factory::getNoopInstance(errorCode
);
85 Normalizer::~Normalizer()
87 delete fFilteredNorm2
;
92 Normalizer::clone() const
94 return new Normalizer(*this);
98 * Generates a hash code for this iterator.
100 int32_t Normalizer::hashCode() const
102 return text
->hashCode() + fUMode
+ fOptions
+ buffer
.hashCode() + bufferPos
+ currentIndex
+ nextIndex
;
105 UBool
Normalizer::operator==(const Normalizer
& that
) const
109 (fUMode
==that
.fUMode
&&
110 fOptions
==that
.fOptions
&&
112 buffer
==that
.buffer
&&
113 bufferPos
==that
.bufferPos
&&
114 nextIndex
==that
.nextIndex
);
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
122 Normalizer::normalize(const UnicodeString
& source
,
123 UNormalizationMode mode
, int32_t options
,
124 UnicodeString
& result
,
125 UErrorCode
&status
) {
126 if(source
.isBogus() || U_FAILURE(status
)) {
128 if(U_SUCCESS(status
)) {
129 status
=U_ILLEGAL_ARGUMENT_ERROR
;
132 UnicodeString localDest
;
135 if(&source
!=&result
) {
138 // the source and result strings are the same object, use a temporary one
141 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
142 if(U_SUCCESS(status
)) {
143 if(options
&UNORM_UNICODE_3_2
) {
144 FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
145 normalize(source
, *dest
, status
);
147 n2
->normalize(source
, *dest
, status
);
150 if(dest
==&localDest
&& U_SUCCESS(status
)) {
157 Normalizer::compose(const UnicodeString
& source
,
158 UBool compat
, int32_t options
,
159 UnicodeString
& result
,
160 UErrorCode
&status
) {
161 normalize(source
, compat
? UNORM_NFKC
: UNORM_NFC
, options
, result
, status
);
165 Normalizer::decompose(const UnicodeString
& source
,
166 UBool compat
, int32_t options
,
167 UnicodeString
& result
,
168 UErrorCode
&status
) {
169 normalize(source
, compat
? UNORM_NFKD
: UNORM_NFD
, options
, result
, status
);
172 UNormalizationCheckResult
173 Normalizer::quickCheck(const UnicodeString
& source
,
174 UNormalizationMode mode
, int32_t options
,
175 UErrorCode
&status
) {
176 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
177 if(U_SUCCESS(status
)) {
178 if(options
&UNORM_UNICODE_3_2
) {
179 return FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
180 quickCheck(source
, status
);
182 return n2
->quickCheck(source
, status
);
190 Normalizer::isNormalized(const UnicodeString
& source
,
191 UNormalizationMode mode
, int32_t options
,
192 UErrorCode
&status
) {
193 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
194 if(U_SUCCESS(status
)) {
195 if(options
&UNORM_UNICODE_3_2
) {
196 return FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
197 isNormalized(source
, status
);
199 return n2
->isNormalized(source
, status
);
206 UnicodeString
& U_EXPORT2
207 Normalizer::concatenate(const UnicodeString
&left
, const UnicodeString
&right
,
208 UnicodeString
&result
,
209 UNormalizationMode mode
, int32_t options
,
210 UErrorCode
&errorCode
) {
211 if(left
.isBogus() || right
.isBogus() || U_FAILURE(errorCode
)) {
213 if(U_SUCCESS(errorCode
)) {
214 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
217 UnicodeString localDest
;
220 if(&right
!=&result
) {
223 // the right and result strings are the same object, use a temporary one
227 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, errorCode
);
228 if(U_SUCCESS(errorCode
)) {
229 if(options
&UNORM_UNICODE_3_2
) {
230 FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(errorCode
)).
231 append(*dest
, right
, errorCode
);
233 n2
->append(*dest
, right
, errorCode
);
236 if(dest
==&localDest
&& U_SUCCESS(errorCode
)) {
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
248 * Return the current character in the normalized text.
250 UChar32
Normalizer::current() {
251 if(bufferPos
<buffer
.length() || nextNormalize()) {
252 return buffer
.char32At(bufferPos
);
259 * Return the next character in the normalized text and advance
260 * the iteration position by one. If the end
261 * of the text has already been reached, {@link #DONE} is returned.
263 UChar32
Normalizer::next() {
264 if(bufferPos
<buffer
.length() || nextNormalize()) {
265 UChar32 c
=buffer
.char32At(bufferPos
);
266 bufferPos
+=U16_LENGTH(c
);
274 * Return the previous character in the normalized text and decrement
275 * the iteration position by one. If the beginning
276 * of the text has already been reached, {@link #DONE} is returned.
278 UChar32
Normalizer::previous() {
279 if(bufferPos
>0 || previousNormalize()) {
280 UChar32 c
=buffer
.char32At(bufferPos
-1);
281 bufferPos
-=U16_LENGTH(c
);
288 void Normalizer::reset() {
289 currentIndex
=nextIndex
=text
->setToStart();
294 Normalizer::setIndexOnly(int32_t index
) {
295 text
->setIndex(index
); // pins index
296 currentIndex
=nextIndex
=text
->getIndex();
301 * Return the first character in the normalized text. This resets
302 * the <tt>Normalizer's</tt> position to the beginning of the text.
304 UChar32
Normalizer::first() {
310 * Return the last character in the normalized text. This resets
311 * the <tt>Normalizer's</tt> position to be just before the
312 * the input text corresponding to that normalized character.
314 UChar32
Normalizer::last() {
315 currentIndex
=nextIndex
=text
->setToEnd();
321 * Retrieve the current iteration position in the input text that is
322 * being normalized. This method is useful in applications such as
323 * searching, where you need to be able to determine the position in
324 * the input text that corresponds to a given normalized output character.
326 * <b>Note:</b> This method sets the position in the <em>input</em>, while
327 * {@link #next} and {@link #previous} iterate through characters in the
328 * <em>output</em>. This means that there is not necessarily a one-to-one
329 * correspondence between characters returned by <tt>next</tt> and
330 * <tt>previous</tt> and the indices passed to and returned from
331 * <tt>setIndex</tt> and {@link #getIndex}.
334 int32_t Normalizer::getIndex() const {
335 if(bufferPos
<buffer
.length()) {
343 * Retrieve the index of the start of the input text. This is the begin index
344 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
345 * over which this <tt>Normalizer</tt> is iterating
347 int32_t Normalizer::startIndex() const {
348 return text
->startIndex();
352 * Retrieve the index of the end of the input text. This is the end index
353 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
354 * over which this <tt>Normalizer</tt> is iterating
356 int32_t Normalizer::endIndex() const {
357 return text
->endIndex();
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
365 Normalizer::setMode(UNormalizationMode newMode
)
372 Normalizer::getUMode() const
378 Normalizer::setOption(int32_t option
,
384 fOptions
&= (~option
);
390 Normalizer::getOption(int32_t option
) const
392 return (fOptions
& option
) != 0;
396 * Set the input text over which this <tt>Normalizer</tt> will iterate.
397 * The iteration position is set to the beginning of the input text.
400 Normalizer::setText(const UnicodeString
& newText
,
403 if (U_FAILURE(status
)) {
406 CharacterIterator
*newIter
= new StringCharacterIterator(newText
);
407 if (newIter
== NULL
) {
408 status
= U_MEMORY_ALLOCATION_ERROR
;
417 * Set the input text over which this <tt>Normalizer</tt> will iterate.
418 * The iteration position is set to the beginning of the string.
421 Normalizer::setText(const CharacterIterator
& newText
,
424 if (U_FAILURE(status
)) {
427 CharacterIterator
*newIter
= newText
.clone();
428 if (newIter
== NULL
) {
429 status
= U_MEMORY_ALLOCATION_ERROR
;
438 Normalizer::setText(const UChar
* newText
,
442 if (U_FAILURE(status
)) {
445 CharacterIterator
*newIter
= new UCharCharacterIterator(newText
, length
);
446 if (newIter
== NULL
) {
447 status
= U_MEMORY_ALLOCATION_ERROR
;
456 * Copies the text under iteration into the UnicodeString referred to by "result".
457 * @param result Receives a copy of the text under iteration.
460 Normalizer::getText(UnicodeString
& result
)
462 text
->getText(result
);
//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------
469 void Normalizer::clearBuffer() {
475 Normalizer::nextNormalize() {
477 currentIndex
=nextIndex
;
478 text
->setIndex(nextIndex
);
479 if(!text
->hasNext()) {
482 // Skip at least one character so we make progress.
483 UnicodeString
segment(text
->next32PostInc());
484 while(text
->hasNext()) {
486 if(fNorm2
->hasBoundaryBefore(c
=text
->next32PostInc())) {
487 text
->move32(-1, CharacterIterator::kCurrent
);
492 nextIndex
=text
->getIndex();
493 UErrorCode errorCode
=U_ZERO_ERROR
;
494 fNorm2
->normalize(segment
, buffer
, errorCode
);
495 return U_SUCCESS(errorCode
) && !buffer
.isEmpty();
499 Normalizer::previousNormalize() {
501 nextIndex
=currentIndex
;
502 text
->setIndex(currentIndex
);
503 if(!text
->hasPrevious()) {
506 UnicodeString segment
;
507 while(text
->hasPrevious()) {
508 UChar32 c
=text
->previous32();
509 segment
.insert(0, c
);
510 if(fNorm2
->hasBoundaryBefore(c
)) {
514 currentIndex
=text
->getIndex();
515 UErrorCode errorCode
=U_ZERO_ERROR
;
516 fNorm2
->normalize(segment
, buffer
, errorCode
);
517 bufferPos
=buffer
.length();
518 return U_SUCCESS(errorCode
) && !buffer
.isEmpty();
523 #endif /* #if !UCONFIG_NO_NORMALIZATION */