]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/normlzr.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / common / normlzr.cpp
CommitLineData
b75a7d8f
A
1/*
2 *************************************************************************
3 * COPYRIGHT:
729e4ab9 4 * Copyright (c) 1996-2010, International Business Machines Corporation and
b75a7d8f
A
5 * others. All Rights Reserved.
6 *************************************************************************
7 */
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_NORMALIZATION
12
729e4ab9 13#include "unicode/uniset.h"
b75a7d8f
A
14#include "unicode/unistr.h"
15#include "unicode/chariter.h"
16#include "unicode/schriter.h"
17#include "unicode/uchriter.h"
b75a7d8f
A
18#include "unicode/normlzr.h"
19#include "cmemory.h"
729e4ab9
A
20#include "normalizer2impl.h"
21#include "uprops.h" // for uniset_getUnicode32Instance()
b75a7d8f
A
22
23U_NAMESPACE_BEGIN
24
374ca955 25UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
b75a7d8f
A
26
27//-------------------------------------------------------------------------
28// Constructors and other boilerplate
29//-------------------------------------------------------------------------
30
31Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
729e4ab9
A
32 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
33 text(new StringCharacterIterator(str)),
b75a7d8f
A
34 currentIndex(0), nextIndex(0),
35 buffer(), bufferPos(0)
36{
729e4ab9 37 init();
b75a7d8f
A
38}
39
40Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
729e4ab9
A
41 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
42 text(new UCharCharacterIterator(str, length)),
b75a7d8f
A
43 currentIndex(0), nextIndex(0),
44 buffer(), bufferPos(0)
45{
729e4ab9 46 init();
b75a7d8f
A
47}
48
49Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
729e4ab9
A
50 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
51 text(iter.clone()),
b75a7d8f
A
52 currentIndex(0), nextIndex(0),
53 buffer(), bufferPos(0)
54{
729e4ab9 55 init();
b75a7d8f
A
56}
57
58Normalizer::Normalizer(const Normalizer &copy) :
729e4ab9
A
59 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
60 text(copy.text->clone()),
b75a7d8f
A
61 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
62 buffer(copy.buffer), bufferPos(copy.bufferPos)
63{
729e4ab9 64 init();
b75a7d8f
A
65}
66
67static const UChar _NUL=0;
68
69void
729e4ab9 70Normalizer::init() {
b75a7d8f 71 UErrorCode errorCode=U_ZERO_ERROR;
729e4ab9
A
72 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
73 if(fOptions&UNORM_UNICODE_3_2) {
74 delete fFilteredNorm2;
75 fNorm2=fFilteredNorm2=
76 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
77 }
78 if(U_FAILURE(errorCode)) {
79 errorCode=U_ZERO_ERROR;
80 fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
b75a7d8f
A
81 }
82}
83
84Normalizer::~Normalizer()
85{
729e4ab9
A
86 delete fFilteredNorm2;
87 delete text;
b75a7d8f
A
88}
89
90Normalizer*
91Normalizer::clone() const
92{
729e4ab9 93 return new Normalizer(*this);
b75a7d8f
A
94}
95
96/**
97 * Generates a hash code for this iterator.
98 */
99int32_t Normalizer::hashCode() const
100{
729e4ab9 101 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
b75a7d8f
A
102}
103
104UBool Normalizer::operator==(const Normalizer& that) const
105{
106 return
107 this==&that ||
729e4ab9 108 (fUMode==that.fUMode &&
b75a7d8f 109 fOptions==that.fOptions &&
729e4ab9 110 *text==*that.text &&
b75a7d8f
A
111 buffer==that.buffer &&
112 bufferPos==that.bufferPos &&
729e4ab9 113 nextIndex==that.nextIndex);
b75a7d8f
A
114}
115
116//-------------------------------------------------------------------------
117// Static utility methods
118//-------------------------------------------------------------------------
119
374ca955 120void U_EXPORT2
b75a7d8f
A
121Normalizer::normalize(const UnicodeString& source,
122 UNormalizationMode mode, int32_t options,
123 UnicodeString& result,
124 UErrorCode &status) {
125 if(source.isBogus() || U_FAILURE(status)) {
126 result.setToBogus();
127 if(U_SUCCESS(status)) {
128 status=U_ILLEGAL_ARGUMENT_ERROR;
129 }
130 } else {
131 UnicodeString localDest;
132 UnicodeString *dest;
133
134 if(&source!=&result) {
135 dest=&result;
136 } else {
137 // the source and result strings are the same object, use a temporary one
138 dest=&localDest;
139 }
729e4ab9
A
140 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
141 if(U_SUCCESS(status)) {
142 if(options&UNORM_UNICODE_3_2) {
143 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
144 normalize(source, *dest, status);
145 } else {
146 n2->normalize(source, *dest, status);
147 }
b75a7d8f 148 }
729e4ab9 149 if(dest==&localDest && U_SUCCESS(status)) {
b75a7d8f
A
150 result=*dest;
151 }
b75a7d8f
A
152 }
153}
154
374ca955 155void U_EXPORT2
b75a7d8f
A
156Normalizer::compose(const UnicodeString& source,
157 UBool compat, int32_t options,
158 UnicodeString& result,
159 UErrorCode &status) {
729e4ab9 160 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
b75a7d8f
A
161}
162
374ca955 163void U_EXPORT2
b75a7d8f
A
164Normalizer::decompose(const UnicodeString& source,
165 UBool compat, int32_t options,
166 UnicodeString& result,
167 UErrorCode &status) {
729e4ab9
A
168 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
169}
170
171UNormalizationCheckResult
172Normalizer::quickCheck(const UnicodeString& source,
173 UNormalizationMode mode, int32_t options,
174 UErrorCode &status) {
175 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
176 if(U_SUCCESS(status)) {
177 if(options&UNORM_UNICODE_3_2) {
178 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
179 quickCheck(source, status);
180 } else {
181 return n2->quickCheck(source, status);
b75a7d8f
A
182 }
183 } else {
729e4ab9
A
184 return UNORM_MAYBE;
185 }
186}
b75a7d8f 187
729e4ab9
A
188UBool
189Normalizer::isNormalized(const UnicodeString& source,
190 UNormalizationMode mode, int32_t options,
191 UErrorCode &status) {
192 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
193 if(U_SUCCESS(status)) {
194 if(options&UNORM_UNICODE_3_2) {
195 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
196 isNormalized(source, status);
b75a7d8f 197 } else {
729e4ab9 198 return n2->isNormalized(source, status);
b75a7d8f 199 }
729e4ab9
A
200 } else {
201 return FALSE;
b75a7d8f
A
202 }
203}
204
374ca955 205UnicodeString & U_EXPORT2
b75a7d8f
A
206Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
207 UnicodeString &result,
208 UNormalizationMode mode, int32_t options,
209 UErrorCode &errorCode) {
210 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
211 result.setToBogus();
212 if(U_SUCCESS(errorCode)) {
213 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
214 }
215 } else {
216 UnicodeString localDest;
217 UnicodeString *dest;
218
729e4ab9 219 if(&right!=&result) {
b75a7d8f
A
220 dest=&result;
221 } else {
729e4ab9 222 // the right and result strings are the same object, use a temporary one
b75a7d8f
A
223 dest=&localDest;
224 }
729e4ab9
A
225 *dest=left;
226 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
227 if(U_SUCCESS(errorCode)) {
228 if(options&UNORM_UNICODE_3_2) {
229 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
230 append(*dest, right, errorCode);
231 } else {
232 n2->append(*dest, right, errorCode);
233 }
b75a7d8f 234 }
729e4ab9 235 if(dest==&localDest && U_SUCCESS(errorCode)) {
b75a7d8f
A
236 result=*dest;
237 }
b75a7d8f
A
238 }
239 return result;
240}
241
242//-------------------------------------------------------------------------
243// Iteration API
244//-------------------------------------------------------------------------
245
246/**
247 * Return the current character in the normalized text.
248 */
249UChar32 Normalizer::current() {
250 if(bufferPos<buffer.length() || nextNormalize()) {
251 return buffer.char32At(bufferPos);
252 } else {
253 return DONE;
254 }
255}
256
257/**
258 * Return the next character in the normalized text and advance
259 * the iteration position by one. If the end
260 * of the text has already been reached, {@link #DONE} is returned.
261 */
262UChar32 Normalizer::next() {
263 if(bufferPos<buffer.length() || nextNormalize()) {
264 UChar32 c=buffer.char32At(bufferPos);
265 bufferPos+=UTF_CHAR_LENGTH(c);
266 return c;
267 } else {
268 return DONE;
269 }
270}
271
272/**
273 * Return the previous character in the normalized text and decrement
274 * the iteration position by one. If the beginning
275 * of the text has already been reached, {@link #DONE} is returned.
276 */
277UChar32 Normalizer::previous() {
278 if(bufferPos>0 || previousNormalize()) {
279 UChar32 c=buffer.char32At(bufferPos-1);
280 bufferPos-=UTF_CHAR_LENGTH(c);
281 return c;
282 } else {
283 return DONE;
284 }
285}
286
287void Normalizer::reset() {
729e4ab9 288 currentIndex=nextIndex=text->setToStart();
b75a7d8f
A
289 clearBuffer();
290}
291
292void
293Normalizer::setIndexOnly(int32_t index) {
729e4ab9
A
294 text->setIndex(index); // pins index
295 currentIndex=nextIndex=text->getIndex();
b75a7d8f
A
296 clearBuffer();
297}
298
299/**
729e4ab9
A
300 * Return the first character in the normalized text. This resets
301 * the <tt>Normalizer's</tt> position to the beginning of the text.
b75a7d8f
A
302 */
303UChar32 Normalizer::first() {
304 reset();
305 return next();
306}
307
308/**
729e4ab9 309 * Return the last character in the normalized text. This resets
b75a7d8f
A
310 * the <tt>Normalizer's</tt> position to be just before the
311 * the input text corresponding to that normalized character.
312 */
313UChar32 Normalizer::last() {
729e4ab9 314 currentIndex=nextIndex=text->setToEnd();
b75a7d8f
A
315 clearBuffer();
316 return previous();
317}
318
319/**
320 * Retrieve the current iteration position in the input text that is
321 * being normalized. This method is useful in applications such as
322 * searching, where you need to be able to determine the position in
323 * the input text that corresponds to a given normalized output character.
324 * <p>
325 * <b>Note:</b> This method sets the position in the <em>input</em>, while
326 * {@link #next} and {@link #previous} iterate through characters in the
327 * <em>output</em>. This means that there is not necessarily a one-to-one
328 * correspondence between characters returned by <tt>next</tt> and
329 * <tt>previous</tt> and the indices passed to and returned from
330 * <tt>setIndex</tt> and {@link #getIndex}.
331 *
332 */
333int32_t Normalizer::getIndex() const {
334 if(bufferPos<buffer.length()) {
335 return currentIndex;
336 } else {
337 return nextIndex;
338 }
339}
340
341/**
729e4ab9 342 * Retrieve the index of the start of the input text. This is the begin index
b75a7d8f
A
343 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
344 * over which this <tt>Normalizer</tt> is iterating
345 */
346int32_t Normalizer::startIndex() const {
729e4ab9 347 return text->startIndex();
b75a7d8f
A
348}
349
350/**
729e4ab9 351 * Retrieve the index of the end of the input text. This is the end index
b75a7d8f
A
352 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
353 * over which this <tt>Normalizer</tt> is iterating
354 */
355int32_t Normalizer::endIndex() const {
729e4ab9 356 return text->endIndex();
b75a7d8f
A
357}
358
359//-------------------------------------------------------------------------
360// Property access methods
361//-------------------------------------------------------------------------
362
363void
364Normalizer::setMode(UNormalizationMode newMode)
365{
366 fUMode = newMode;
729e4ab9 367 init();
b75a7d8f
A
368}
369
370UNormalizationMode
371Normalizer::getUMode() const
372{
373 return fUMode;
374}
375
376void
377Normalizer::setOption(int32_t option,
378 UBool value)
379{
380 if (value) {
381 fOptions |= option;
382 } else {
383 fOptions &= (~option);
384 }
729e4ab9 385 init();
b75a7d8f
A
386}
387
388UBool
389Normalizer::getOption(int32_t option) const
390{
391 return (fOptions & option) != 0;
392}
393
394/**
395 * Set the input text over which this <tt>Normalizer</tt> will iterate.
729e4ab9 396 * The iteration position is set to the beginning of the input text.
b75a7d8f
A
397 */
398void
399Normalizer::setText(const UnicodeString& newText,
400 UErrorCode &status)
401{
402 if (U_FAILURE(status)) {
403 return;
404 }
405 CharacterIterator *newIter = new StringCharacterIterator(newText);
406 if (newIter == NULL) {
407 status = U_MEMORY_ALLOCATION_ERROR;
408 return;
409 }
729e4ab9
A
410 delete text;
411 text = newIter;
b75a7d8f
A
412 reset();
413}
414
415/**
416 * Set the input text over which this <tt>Normalizer</tt> will iterate.
417 * The iteration position is set to the beginning of the string.
418 */
419void
420Normalizer::setText(const CharacterIterator& newText,
421 UErrorCode &status)
422{
423 if (U_FAILURE(status)) {
424 return;
425 }
426 CharacterIterator *newIter = newText.clone();
427 if (newIter == NULL) {
428 status = U_MEMORY_ALLOCATION_ERROR;
429 return;
430 }
729e4ab9
A
431 delete text;
432 text = newIter;
b75a7d8f
A
433 reset();
434}
435
436void
437Normalizer::setText(const UChar* newText,
438 int32_t length,
439 UErrorCode &status)
440{
441 if (U_FAILURE(status)) {
442 return;
443 }
444 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
445 if (newIter == NULL) {
446 status = U_MEMORY_ALLOCATION_ERROR;
447 return;
448 }
729e4ab9
A
449 delete text;
450 text = newIter;
b75a7d8f
A
451 reset();
452}
453
454/**
455 * Copies the text under iteration into the UnicodeString referred to by "result".
456 * @param result Receives a copy of the text under iteration.
457 */
458void
459Normalizer::getText(UnicodeString& result)
460{
729e4ab9 461 text->getText(result);
b75a7d8f
A
462}
463
464//-------------------------------------------------------------------------
465// Private utility methods
466//-------------------------------------------------------------------------
467
468void Normalizer::clearBuffer() {
469 buffer.remove();
470 bufferPos=0;
471}
472
473UBool
474Normalizer::nextNormalize() {
b75a7d8f
A
475 clearBuffer();
476 currentIndex=nextIndex;
729e4ab9
A
477 text->setIndex(nextIndex);
478 if(!text->hasNext()) {
b75a7d8f
A
479 return FALSE;
480 }
729e4ab9
A
481 // Skip at least one character so we make progress.
482 UnicodeString segment(text->next32PostInc());
483 while(text->hasNext()) {
484 UChar32 c;
485 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
486 text->move32(-1, CharacterIterator::kCurrent);
487 break;
488 }
489 segment.append(c);
b75a7d8f 490 }
729e4ab9
A
491 nextIndex=text->getIndex();
492 UErrorCode errorCode=U_ZERO_ERROR;
493 fNorm2->normalize(segment, buffer, errorCode);
b75a7d8f
A
494 return U_SUCCESS(errorCode) && !buffer.isEmpty();
495}
496
497UBool
498Normalizer::previousNormalize() {
b75a7d8f
A
499 clearBuffer();
500 nextIndex=currentIndex;
729e4ab9
A
501 text->setIndex(currentIndex);
502 if(!text->hasPrevious()) {
b75a7d8f
A
503 return FALSE;
504 }
729e4ab9
A
505 UnicodeString segment;
506 while(text->hasPrevious()) {
507 UChar32 c=text->previous32();
508 segment.insert(0, c);
509 if(fNorm2->hasBoundaryBefore(c)) {
510 break;
511 }
b75a7d8f 512 }
729e4ab9
A
513 currentIndex=text->getIndex();
514 UErrorCode errorCode=U_ZERO_ERROR;
515 fNorm2->normalize(segment, buffer, errorCode);
b75a7d8f 516 bufferPos=buffer.length();
b75a7d8f
A
517 return U_SUCCESS(errorCode) && !buffer.isEmpty();
518}
519
520U_NAMESPACE_END
521
522#endif /* #if !UCONFIG_NO_NORMALIZATION */