1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *************************************************************************
6 * Copyright (c) 1996-2012, International Business Machines Corporation and
7 * others. All Rights Reserved.
8 *************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_NORMALIZATION
15 #include "unicode/uniset.h"
16 #include "unicode/unistr.h"
17 #include "unicode/chariter.h"
18 #include "unicode/schriter.h"
19 #include "unicode/uchriter.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/utf16.h"
23 #include "normalizer2impl.h"
24 #include "uprops.h" // for uniset_getUnicode32Instance()
27 // System can define move32 intrinsics, but the char iters define move32 method
28 // using same undef trick in headers, so undef here to re-enable the method.
// Expands the RTTI implementation macro for the Normalizer class.
// NOTE(review): the macro is defined in a header outside this listing; what it
// generates should be confirmed there.
34 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer
)
36 //-------------------------------------------------------------------------
37 // Constructors and other boilerplate
38 //-------------------------------------------------------------------------
// Constructs a Normalizer over a UnicodeString.
//   str  - input text; wrapped in a StringCharacterIterator allocated with new
//          and stored in `text`.
//   mode - normalization mode, stored in fUMode.
// Both normalizer pointers start as NULL, fOptions starts at 0, the buffer is
// default-constructed and currentIndex/nextIndex/bufferPos start at 0.
// NOTE(review): the constructor body is not visible in this listing.
40 Normalizer::Normalizer(const UnicodeString
& str
, UNormalizationMode mode
) :
41 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
42 text(new StringCharacterIterator(str
)),
43 currentIndex(0), nextIndex(0),
44 buffer(), bufferPos(0)
// Constructs a Normalizer over a UTF-16 character array.
//   str    - pointer to the input text.
//   length - passed together with str to UCharCharacterIterator.
//   mode   - normalization mode, stored in fUMode.
// The input is wrapped in a UCharCharacterIterator allocated with new and
// stored in `text`; all other members start as in the UnicodeString overload
// (NULL normalizers, fOptions 0, positions 0, default-constructed buffer).
// NOTE(review): the constructor body is not visible in this listing.
49 Normalizer::Normalizer(ConstChar16Ptr str
, int32_t length
, UNormalizationMode mode
) :
50 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
51 text(new UCharCharacterIterator(str
, length
)),
52 currentIndex(0), nextIndex(0),
53 buffer(), bufferPos(0)
// Constructs a Normalizer over a CharacterIterator.
//   iter - source iterator for the input text.
//   mode - normalization mode, stored in fUMode.
// NOTE(review): unlike the sibling constructors, no initializer for `text` is
// visible here (the embedded numbering jumps from 59 to 61) - presumably a line
// was lost from this listing; confirm against the upstream file.
// NOTE(review): the constructor body is not visible in this listing either.
58 Normalizer::Normalizer(const CharacterIterator
& iter
, UNormalizationMode mode
) :
59 UObject(), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(mode
), fOptions(0),
61 currentIndex(0), nextIndex(0),
62 buffer(), bufferPos(0)
// Copy constructor.
// Copies the mode, options, iteration indexes, buffer and buffer position from
// `copy`, and gives the new object its own input iterator via copy.text->clone().
// The normalizer pointers are NOT copied; they start as NULL.
// Fix: the parameter declarator had been mis-encoded as a copyright sign (an
// HTML entity decode of "&copy"); it is restored to `&copy`, which is the name
// the initializer list below refers to.
// NOTE(review): the constructor body is not visible in this listing.
67 Normalizer::Normalizer(const Normalizer
&copy
) :
68 UObject(copy
), fFilteredNorm2(NULL
), fNorm2(NULL
), fUMode(copy
.fUMode
), fOptions(copy
.fOptions
),
69 text(copy
.text
->clone()),
70 currentIndex(copy
.currentIndex
), nextIndex(copy
.nextIndex
),
71 buffer(copy
.buffer
), bufferPos(copy
.bufferPos
)
// Selects the Normalizer2 implementation used by this object.
// NOTE(review): the function header for these statements is not visible in this
// listing - presumably a private initialization helper; confirm against upstream.
78 UErrorCode errorCode
=U_ZERO_ERROR
;
// Look up the normalizer instance for the current mode.
79 fNorm2
=Normalizer2Factory::getInstance(fUMode
, errorCode
);
// When the UNORM_UNICODE_3_2 option bit is set, replace any previous filtered
// wrapper and route normalization through a FilteredNormalizer2 built from the
// set returned by uniset_getUnicode32Instance().
80 if(fOptions
&UNORM_UNICODE_3_2
) {
81 delete fFilteredNorm2
;
82 fNorm2
=fFilteredNorm2
=
83 new FilteredNormalizer2(*fNorm2
, *uniset_getUnicode32Instance(errorCode
));
// On failure, clear the error and fall back to the factory's no-op instance.
85 if(U_FAILURE(errorCode
)) {
86 errorCode
=U_ZERO_ERROR
;
87 fNorm2
=Normalizer2Factory::getNoopInstance(errorCode
);
// Destructor: deletes the FilteredNormalizer2 wrapper held in fFilteredNorm2
// (deleting NULL is harmless when no wrapper was created).
// NOTE(review): `text` is allocated with new / clone() in the constructors, but
// no matching delete is visible in this listing - presumably a line was lost;
// confirm against the upstream file.
91 Normalizer::~Normalizer()
93 delete fFilteredNorm2
;
// Returns a new Normalizer allocated with new and built by the copy constructor
// from *this.
// NOTE(review): the return type line is not visible in this listing; presumably
// the caller takes ownership of the returned object - confirm.
98 Normalizer::clone() const
100 return new Normalizer(*this);
104 * Generates a hash code for this iterator.
// Computes the hash as a plain sum of: the input iterator's hash, the mode, the
// options, the buffer's hash, the buffer position and both input indexes.
106 int32_t Normalizer::hashCode() const
108 return text
->hashCode() + fUMode
+ fOptions
+ buffer
.hashCode() + bufferPos
+ currentIndex
+ nextIndex
;
// Equality comparison. The visible terms compare mode, options, buffer
// contents, buffer position and nextIndex, all joined with &&.
// NOTE(review): the embedded numbering skips 112-114 and 117, so the start of
// the return expression and possibly further terms (e.g. a comparison of
// `text`, which hashCode() includes) are not visible here; confirm upstream.
111 UBool
Normalizer::operator==(const Normalizer
& that
) const
115 (fUMode
==that
.fUMode
&&
116 fOptions
==that
.fOptions
&&
118 buffer
==that
.buffer
&&
119 bufferPos
==that
.bufferPos
&&
120 nextIndex
==that
.nextIndex
);
123 //-------------------------------------------------------------------------
124 // Static utility methods
125 //-------------------------------------------------------------------------
// Static helper: normalizes `source` according to `mode` and `options`.
//   source  - input string; a bogus string is rejected.
//   mode    - selects the Normalizer2 instance from Normalizer2Factory.
//   options - when the UNORM_UNICODE_3_2 bit is set, normalization runs through
//             a temporary FilteredNormalizer2 built from the set returned by
//             uniset_getUnicode32Instance().
//   result  - output string.
//   status  - in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR when the
//             source is bogus and no earlier error was recorded.
// NOTE(review): the return type, the declaration of `dest`, the branch bodies
// and the closing braces fall on lines missing from this listing.
128 Normalizer::normalize(const UnicodeString
& source
,
129 UNormalizationMode mode
, int32_t options
,
130 UnicodeString
& result
,
131 UErrorCode
&status
) {
// Reject a bogus source or an already-failed status.
132 if(source
.isBogus() || U_FAILURE(status
)) {
134 if(U_SUCCESS(status
)) {
135 status
=U_ILLEGAL_ARGUMENT_ERROR
;
// Scratch output, used when source and result are the same object.
138 UnicodeString localDest
;
141 if(&source
!=&result
) {
144 // the source and result strings are the same object, use a temporary one
147 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
148 if(U_SUCCESS(status
)) {
149 if(options
&UNORM_UNICODE_3_2
) {
150 FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
151 normalize(source
, *dest
, status
);
153 n2
->normalize(source
, *dest
, status
);
// True only when the temporary was used and normalization succeeded.
156 if(dest
==&localDest
&& U_SUCCESS(status
)) {
// Convenience wrapper around normalize(): uses UNORM_NFKC when `compat` is
// true and UNORM_NFC otherwise; options, result and status are passed through.
// NOTE(review): the return type line is not visible in this listing.
163 Normalizer::compose(const UnicodeString
& source
,
164 UBool compat
, int32_t options
,
165 UnicodeString
& result
,
166 UErrorCode
&status
) {
167 normalize(source
, compat
? UNORM_NFKC
: UNORM_NFC
, options
, result
, status
);
// Convenience wrapper around normalize(): uses UNORM_NFKD when `compat` is
// true and UNORM_NFD otherwise; options, result and status are passed through.
// NOTE(review): the return type line is not visible in this listing.
171 Normalizer::decompose(const UnicodeString
& source
,
172 UBool compat
, int32_t options
,
173 UnicodeString
& result
,
174 UErrorCode
&status
) {
175 normalize(source
, compat
? UNORM_NFKD
: UNORM_NFD
, options
, result
, status
);
// Static helper: runs the quick check of `source` for the given mode.
// Looks up the Normalizer2 for `mode`; when the UNORM_UNICODE_3_2 option bit is
// set, the check goes through a temporary FilteredNormalizer2 built from the
// set returned by uniset_getUnicode32Instance(), otherwise it is delegated
// directly to the instance.
// NOTE(review): the return taken when the factory lookup fails is on a line not
// visible in this listing.
178 UNormalizationCheckResult
179 Normalizer::quickCheck(const UnicodeString
& source
,
180 UNormalizationMode mode
, int32_t options
,
181 UErrorCode
&status
) {
182 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
183 if(U_SUCCESS(status
)) {
184 if(options
&UNORM_UNICODE_3_2
) {
185 return FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
186 quickCheck(source
, status
);
188 return n2
->quickCheck(source
, status
);
// Static helper: reports whether `source` is normalized for the given mode.
// Same structure as quickCheck(): the Normalizer2 for `mode` is looked up, and
// the call goes through a temporary FilteredNormalizer2 when the
// UNORM_UNICODE_3_2 option bit is set.
// NOTE(review): the return type and the return taken when the factory lookup
// fails are on lines not visible in this listing.
196 Normalizer::isNormalized(const UnicodeString
& source
,
197 UNormalizationMode mode
, int32_t options
,
198 UErrorCode
&status
) {
199 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, status
);
200 if(U_SUCCESS(status
)) {
201 if(options
&UNORM_UNICODE_3_2
) {
202 return FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(status
)).
203 isNormalized(source
, status
);
205 return n2
->isNormalized(source
, status
);
// Static helper: concatenates two strings under normalization.
//   left, right - inputs; a bogus string in either position is rejected.
//   result      - output string, also the returned reference type.
//   mode        - selects the Normalizer2 instance from Normalizer2Factory.
//   options     - when the UNORM_UNICODE_3_2 bit is set, the append runs
//                 through a temporary FilteredNormalizer2.
//   errorCode   - in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR when an
//                 input is bogus and no earlier error was recorded.
// The visible work is n2->append(*dest, right, errorCode).
// NOTE(review): the declaration of `dest`, the step that places `left` into it,
// the branch bodies and the return statement are on lines missing here.
212 UnicodeString
& U_EXPORT2
213 Normalizer::concatenate(const UnicodeString
&left
, const UnicodeString
&right
,
214 UnicodeString
&result
,
215 UNormalizationMode mode
, int32_t options
,
216 UErrorCode
&errorCode
) {
// Reject bogus inputs or an already-failed error code.
217 if(left
.isBogus() || right
.isBogus() || U_FAILURE(errorCode
)) {
219 if(U_SUCCESS(errorCode
)) {
220 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// Scratch output, used when right and result are the same object.
223 UnicodeString localDest
;
226 if(&right
!=&result
) {
229 // the right and result strings are the same object, use a temporary one
233 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, errorCode
);
234 if(U_SUCCESS(errorCode
)) {
235 if(options
&UNORM_UNICODE_3_2
) {
236 FilteredNormalizer2(*n2
, *uniset_getUnicode32Instance(errorCode
)).
237 append(*dest
, right
, errorCode
);
239 n2
->append(*dest
, right
, errorCode
);
// True only when the temporary was used and the append succeeded.
242 if(dest
==&localDest
&& U_SUCCESS(errorCode
)) {
249 //-------------------------------------------------------------------------
251 //-------------------------------------------------------------------------
254 * Return the current character in the normalized text.
// Returns the code point at bufferPos without moving the position, provided the
// buffer still has content at bufferPos or nextNormalize() reports success.
// NOTE(review): the value returned when both conditions fail is on a line not
// visible in this listing.
256 UChar32
Normalizer::current() {
257 if(bufferPos
<buffer
.length() || nextNormalize()) {
258 return buffer
.char32At(bufferPos
);
265 * Return the next character in the normalized text and advance
266 * the iteration position by one. If the end
267 * of the text has already been reached, {@link #DONE} is returned.
// Reads the code point at bufferPos and advances bufferPos past it, provided
// the buffer still has content at bufferPos or nextNormalize() reports success.
// NOTE(review): the return statements are on lines not visible in this listing.
269 UChar32
Normalizer::next() {
270 if(bufferPos
<buffer
.length() || nextNormalize()) {
271 UChar32 c
=buffer
.char32At(bufferPos
);
// Advance by the number of UTF-16 code units that c occupies.
272 bufferPos
+=U16_LENGTH(c
);
280 * Return the previous character in the normalized text and decrement
281 * the iteration position by one. If the beginning
282 * of the text has already been reached, {@link #DONE} is returned.
// Reads the code point that ends just before bufferPos and moves bufferPos back
// over it, provided bufferPos is past the buffer start or previousNormalize()
// reports success.
// NOTE(review): the return statements are on lines not visible in this listing.
284 UChar32
Normalizer::previous() {
285 if(bufferPos
>0 || previousNormalize()) {
286 UChar32 c
=buffer
.char32At(bufferPos
-1);
// Step back by the number of UTF-16 code units that c occupies.
287 bufferPos
-=U16_LENGTH(c
);
// Moves the input iterator to its start and sets both currentIndex and
// nextIndex to the index it returns.
// NOTE(review): the rest of the body is not visible in this listing.
294 void Normalizer::reset() {
295 currentIndex
=nextIndex
=text
->setToStart();
// Positions the input iterator at `index`, then reads the iterator's resulting
// index back into both currentIndex and nextIndex (the iterator pins the
// requested value, so the stored index may differ from the argument).
// NOTE(review): the return type and the rest of the body are not visible here.
300 Normalizer::setIndexOnly(int32_t index
) {
301 text
->setIndex(index
); // pins index
302 currentIndex
=nextIndex
=text
->getIndex();
307 * Return the first character in the normalized text. This resets
308 * the <tt>Normalizer's</tt> position to the beginning of the text.
// NOTE(review): only the signature of first() is visible in this listing; its
// body falls on missing lines, so its behaviour cannot be documented from here.
310 UChar32
Normalizer::first() {
316 * Return the last character in the normalized text. This resets
317 * the <tt>Normalizer's</tt> position to be just before the
318 * input text corresponding to that normalized character.
// Moves the input iterator to its end and sets both currentIndex and nextIndex
// to the index it returns.
// NOTE(review): the rest of the body, including the returned value, is not
// visible in this listing.
320 UChar32
Normalizer::last() {
321 currentIndex
=nextIndex
=text
->setToEnd();
327 * Retrieve the current iteration position in the input text that is
328 * being normalized. This method is useful in applications such as
329 * searching, where you need to be able to determine the position in
330 * the input text that corresponds to a given normalized output character.
332 * <b>Note:</b> This method reports a position in the <em>input</em>, while
333 * {@link #next} and {@link #previous} iterate through characters in the
334 * <em>output</em>. This means that there is not necessarily a one-to-one
335 * correspondence between characters returned by <tt>next</tt> and
336 * <tt>previous</tt> and the indices passed to and returned from
337 * <tt>setIndex</tt> and {@link #getIndex}.
// Returns an index into the input text; the choice depends on whether
// bufferPos is still inside the buffer.
// NOTE(review): both return statements are on lines not visible in this
// listing, so which index each branch yields must be confirmed upstream.
340 int32_t Normalizer::getIndex() const {
341 if(bufferPos
<buffer
.length()) {
349 * Retrieve the index of the start of the input text. This is the begin index
350 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
351 * over which this <tt>Normalizer</tt> is iterating
// Returns the start index reported by the underlying input iterator.
353 int32_t Normalizer::startIndex() const {
354 return text
->startIndex();
358 * Retrieve the index of the end of the input text. This is the end index
359 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
360 * over which this <tt>Normalizer</tt> is iterating
// Returns the end index reported by the underlying input iterator.
362 int32_t Normalizer::endIndex() const {
363 return text
->endIndex();
366 //-------------------------------------------------------------------------
367 // Property access methods
368 //-------------------------------------------------------------------------
// Takes the new normalization mode as `newMode`.
// NOTE(review): the return type and the body are not visible in this listing.
371 Normalizer::setMode(UNormalizationMode newMode
)
// Const accessor; presumably returns fUMode - confirm upstream.
// NOTE(review): the return type and the body are not visible in this listing.
378 Normalizer::getUMode() const
// Modifies the fOptions bit mask. The visible statement clears the bits given
// in `option`.
// NOTE(review): the second parameter, the condition guarding this statement and
// any bit-setting branch are on lines not visible in this listing.
384 Normalizer::setOption(int32_t option
,
390 fOptions
&= (~option
);
// Returns true when at least one of the bits in `option` is set in fOptions.
// NOTE(review): the return type line is not visible in this listing.
396 Normalizer::getOption(int32_t option
) const
398 return (fOptions
& option
) != 0;
402 * Set the input text over which this <tt>Normalizer</tt> will iterate.
403 * The iteration position is set to the beginning of the input text.
// setText overload for a UnicodeString. Checks the incoming status, wraps
// newText in a StringCharacterIterator allocated with new, and reports
// U_MEMORY_ALLOCATION_ERROR through `status` when the allocation yields NULL.
// NOTE(review): the `status` parameter declaration, the early returns and the
// code that installs newIter are on lines not visible in this listing.
406 Normalizer::setText(const UnicodeString
& newText
,
409 if (U_FAILURE(status
)) {
412 CharacterIterator
*newIter
= new StringCharacterIterator(newText
);
413 if (newIter
== NULL
) {
414 status
= U_MEMORY_ALLOCATION_ERROR
;
423 * Set the input text over which this <tt>Normalizer</tt> will iterate.
424 * The iteration position is set to the beginning of the string.
// setText overload for a CharacterIterator. Checks the incoming status, clones
// newText, and reports U_MEMORY_ALLOCATION_ERROR through `status` when the
// clone is NULL.
// NOTE(review): the `status` parameter declaration, the early returns and the
// code that installs newIter are on lines not visible in this listing.
427 Normalizer::setText(const CharacterIterator
& newText
,
430 if (U_FAILURE(status
)) {
433 CharacterIterator
*newIter
= newText
.clone();
434 if (newIter
== NULL
) {
435 status
= U_MEMORY_ALLOCATION_ERROR
;
// setText overload for a UTF-16 character array. Checks the incoming status,
// wraps newText and `length` in a UCharCharacterIterator allocated with new,
// and reports U_MEMORY_ALLOCATION_ERROR through `status` when the allocation
// yields NULL.
// NOTE(review): the `length` and `status` parameter declarations, the early
// returns and the code that installs newIter are on lines not visible here.
444 Normalizer::setText(ConstChar16Ptr newText
,
448 if (U_FAILURE(status
)) {
451 CharacterIterator
*newIter
= new UCharCharacterIterator(newText
, length
);
452 if (newIter
== NULL
) {
453 status
= U_MEMORY_ALLOCATION_ERROR
;
462 * Copies the text under iteration into the UnicodeString referred to by "result".
463 * @param result Receives a copy of the text under iteration.
// Delegates to the input iterator's getText(), which fills `result`.
// NOTE(review): the return type line is not visible in this listing.
466 Normalizer::getText(UnicodeString
& result
)
468 text
->getText(result
);
471 //-------------------------------------------------------------------------
472 // Private utility methods
473 //-------------------------------------------------------------------------
// NOTE(review): only the signature of clearBuffer() is visible in this listing;
// presumably it empties `buffer` and resets bufferPos - confirm upstream.
475 void Normalizer::clearBuffer() {
// Normalizes the next segment of input into `buffer`.
// Starting at nextIndex, collects input into `segment` - always at least one
// code point - while scanning forward and testing each further code point with
// fNorm2->hasBoundaryBefore(). A code point that tests true is un-read by
// stepping the iterator back one code point. nextIndex then becomes the
// iterator position and the segment is normalized into `buffer`.
// Returns true when normalization succeeded and the buffer is not empty.
// NOTE(review): the return type, the declaration of `c`, the no-more-input
// return and the loop's append/break statements are on lines not visible here.
481 Normalizer::nextNormalize() {
// The new segment begins where the previous one ended.
483 currentIndex
=nextIndex
;
484 text
->setIndex(nextIndex
);
485 if(!text
->hasNext()) {
488 // Skip at least one character so we make progress.
489 UnicodeString
segment(text
->next32PostInc());
490 while(text
->hasNext()) {
492 if(fNorm2
->hasBoundaryBefore(c
=text
->next32PostInc())) {
// Step back so this code point is left for the following segment.
493 text
->move32(-1, CharacterIterator::kCurrent
);
498 nextIndex
=text
->getIndex();
499 UErrorCode errorCode
=U_ZERO_ERROR
;
500 fNorm2
->normalize(segment
, buffer
, errorCode
);
501 return U_SUCCESS(errorCode
) && !buffer
.isEmpty();
// Normalizes the segment of input that ends at currentIndex into `buffer`.
// Scans backwards from currentIndex, prepending each code point to `segment`
// and testing it with fNorm2->hasBoundaryBefore(). currentIndex then becomes
// the iterator position, the segment is normalized into `buffer`, and
// bufferPos is placed at the end of the buffer so that iteration can continue
// backwards. Returns true when normalization succeeded and the buffer is not
// empty.
// NOTE(review): the return type, the no-more-input return and the statement
// inside the boundary test are on lines not visible in this listing.
505 Normalizer::previousNormalize() {
// The new segment ends where the current one starts.
507 nextIndex
=currentIndex
;
508 text
->setIndex(currentIndex
);
509 if(!text
->hasPrevious()) {
512 UnicodeString segment
;
513 while(text
->hasPrevious()) {
514 UChar32 c
=text
->previous32();
// Prepend, because the input is being read in reverse order.
515 segment
.insert(0, c
);
516 if(fNorm2
->hasBoundaryBefore(c
)) {
520 currentIndex
=text
->getIndex();
521 UErrorCode errorCode
=U_ZERO_ERROR
;
522 fNorm2
->normalize(segment
, buffer
, errorCode
);
523 bufferPos
=buffer
.length();
524 return U_SUCCESS(errorCode
) && !buffer
.isEmpty();
529 #endif /* #if !UCONFIG_NO_NORMALIZATION */