]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/normlzr.cpp
ICU-511.25.tar.gz
[apple/icu.git] / icuSources / common / normlzr.cpp
CommitLineData
b75a7d8f
A
1/*
2 *************************************************************************
3 * COPYRIGHT:
51004dcb 4 * Copyright (c) 1996-2012, International Business Machines Corporation and
b75a7d8f
A
5 * others. All Rights Reserved.
6 *************************************************************************
7 */
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_NORMALIZATION
12
729e4ab9 13#include "unicode/uniset.h"
b75a7d8f
A
14#include "unicode/unistr.h"
15#include "unicode/chariter.h"
16#include "unicode/schriter.h"
17#include "unicode/uchriter.h"
b75a7d8f 18#include "unicode/normlzr.h"
4388f060 19#include "unicode/utf16.h"
b75a7d8f 20#include "cmemory.h"
729e4ab9
A
21#include "normalizer2impl.h"
22#include "uprops.h" // for uniset_getUnicode32Instance()
b75a7d8f
A
23
24U_NAMESPACE_BEGIN
25
374ca955 26UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
b75a7d8f
A
27
28//-------------------------------------------------------------------------
29// Constructors and other boilerplate
30//-------------------------------------------------------------------------
31
32Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
729e4ab9
A
33 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
34 text(new StringCharacterIterator(str)),
b75a7d8f
A
35 currentIndex(0), nextIndex(0),
36 buffer(), bufferPos(0)
37{
729e4ab9 38 init();
b75a7d8f
A
39}
40
41Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
729e4ab9
A
42 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
43 text(new UCharCharacterIterator(str, length)),
b75a7d8f
A
44 currentIndex(0), nextIndex(0),
45 buffer(), bufferPos(0)
46{
729e4ab9 47 init();
b75a7d8f
A
48}
49
50Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
729e4ab9
A
51 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
52 text(iter.clone()),
b75a7d8f
A
53 currentIndex(0), nextIndex(0),
54 buffer(), bufferPos(0)
55{
729e4ab9 56 init();
b75a7d8f
A
57}
58
59Normalizer::Normalizer(const Normalizer &copy) :
729e4ab9
A
60 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
61 text(copy.text->clone()),
b75a7d8f
A
62 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
63 buffer(copy.buffer), bufferPos(copy.bufferPos)
64{
729e4ab9 65 init();
b75a7d8f
A
66}
67
b75a7d8f 68void
729e4ab9 69Normalizer::init() {
b75a7d8f 70 UErrorCode errorCode=U_ZERO_ERROR;
729e4ab9
A
71 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
72 if(fOptions&UNORM_UNICODE_3_2) {
73 delete fFilteredNorm2;
74 fNorm2=fFilteredNorm2=
75 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
76 }
77 if(U_FAILURE(errorCode)) {
78 errorCode=U_ZERO_ERROR;
79 fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
b75a7d8f
A
80 }
81}
82
83Normalizer::~Normalizer()
84{
729e4ab9
A
85 delete fFilteredNorm2;
86 delete text;
b75a7d8f
A
87}
88
89Normalizer*
90Normalizer::clone() const
91{
729e4ab9 92 return new Normalizer(*this);
b75a7d8f
A
93}
94
95/**
96 * Generates a hash code for this iterator.
97 */
98int32_t Normalizer::hashCode() const
99{
729e4ab9 100 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
b75a7d8f
A
101}
102
103UBool Normalizer::operator==(const Normalizer& that) const
104{
105 return
106 this==&that ||
729e4ab9 107 (fUMode==that.fUMode &&
b75a7d8f 108 fOptions==that.fOptions &&
729e4ab9 109 *text==*that.text &&
b75a7d8f
A
110 buffer==that.buffer &&
111 bufferPos==that.bufferPos &&
729e4ab9 112 nextIndex==that.nextIndex);
b75a7d8f
A
113}
114
115//-------------------------------------------------------------------------
116// Static utility methods
117//-------------------------------------------------------------------------
118
374ca955 119void U_EXPORT2
b75a7d8f
A
120Normalizer::normalize(const UnicodeString& source,
121 UNormalizationMode mode, int32_t options,
122 UnicodeString& result,
123 UErrorCode &status) {
124 if(source.isBogus() || U_FAILURE(status)) {
125 result.setToBogus();
126 if(U_SUCCESS(status)) {
127 status=U_ILLEGAL_ARGUMENT_ERROR;
128 }
129 } else {
130 UnicodeString localDest;
131 UnicodeString *dest;
132
133 if(&source!=&result) {
134 dest=&result;
135 } else {
136 // the source and result strings are the same object, use a temporary one
137 dest=&localDest;
138 }
729e4ab9
A
139 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
140 if(U_SUCCESS(status)) {
141 if(options&UNORM_UNICODE_3_2) {
142 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
143 normalize(source, *dest, status);
144 } else {
145 n2->normalize(source, *dest, status);
146 }
b75a7d8f 147 }
729e4ab9 148 if(dest==&localDest && U_SUCCESS(status)) {
b75a7d8f
A
149 result=*dest;
150 }
b75a7d8f
A
151 }
152}
153
374ca955 154void U_EXPORT2
b75a7d8f
A
155Normalizer::compose(const UnicodeString& source,
156 UBool compat, int32_t options,
157 UnicodeString& result,
158 UErrorCode &status) {
729e4ab9 159 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
b75a7d8f
A
160}
161
374ca955 162void U_EXPORT2
b75a7d8f
A
163Normalizer::decompose(const UnicodeString& source,
164 UBool compat, int32_t options,
165 UnicodeString& result,
166 UErrorCode &status) {
729e4ab9
A
167 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
168}
169
170UNormalizationCheckResult
171Normalizer::quickCheck(const UnicodeString& source,
172 UNormalizationMode mode, int32_t options,
173 UErrorCode &status) {
174 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
175 if(U_SUCCESS(status)) {
176 if(options&UNORM_UNICODE_3_2) {
177 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
178 quickCheck(source, status);
179 } else {
180 return n2->quickCheck(source, status);
b75a7d8f
A
181 }
182 } else {
729e4ab9
A
183 return UNORM_MAYBE;
184 }
185}
b75a7d8f 186
729e4ab9
A
187UBool
188Normalizer::isNormalized(const UnicodeString& source,
189 UNormalizationMode mode, int32_t options,
190 UErrorCode &status) {
191 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
192 if(U_SUCCESS(status)) {
193 if(options&UNORM_UNICODE_3_2) {
194 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
195 isNormalized(source, status);
b75a7d8f 196 } else {
729e4ab9 197 return n2->isNormalized(source, status);
b75a7d8f 198 }
729e4ab9
A
199 } else {
200 return FALSE;
b75a7d8f
A
201 }
202}
203
374ca955 204UnicodeString & U_EXPORT2
4388f060 205Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
b75a7d8f
A
206 UnicodeString &result,
207 UNormalizationMode mode, int32_t options,
208 UErrorCode &errorCode) {
209 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
210 result.setToBogus();
211 if(U_SUCCESS(errorCode)) {
212 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
213 }
214 } else {
215 UnicodeString localDest;
216 UnicodeString *dest;
217
729e4ab9 218 if(&right!=&result) {
b75a7d8f
A
219 dest=&result;
220 } else {
729e4ab9 221 // the right and result strings are the same object, use a temporary one
b75a7d8f
A
222 dest=&localDest;
223 }
729e4ab9
A
224 *dest=left;
225 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
226 if(U_SUCCESS(errorCode)) {
227 if(options&UNORM_UNICODE_3_2) {
228 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
229 append(*dest, right, errorCode);
230 } else {
231 n2->append(*dest, right, errorCode);
232 }
b75a7d8f 233 }
729e4ab9 234 if(dest==&localDest && U_SUCCESS(errorCode)) {
b75a7d8f
A
235 result=*dest;
236 }
b75a7d8f
A
237 }
238 return result;
239}
240
241//-------------------------------------------------------------------------
242// Iteration API
243//-------------------------------------------------------------------------
244
245/**
246 * Return the current character in the normalized text.
247 */
248UChar32 Normalizer::current() {
249 if(bufferPos<buffer.length() || nextNormalize()) {
250 return buffer.char32At(bufferPos);
251 } else {
252 return DONE;
253 }
254}
255
256/**
257 * Return the next character in the normalized text and advance
258 * the iteration position by one. If the end
259 * of the text has already been reached, {@link #DONE} is returned.
260 */
261UChar32 Normalizer::next() {
262 if(bufferPos<buffer.length() || nextNormalize()) {
263 UChar32 c=buffer.char32At(bufferPos);
4388f060 264 bufferPos+=U16_LENGTH(c);
b75a7d8f
A
265 return c;
266 } else {
267 return DONE;
268 }
269}
270
271/**
272 * Return the previous character in the normalized text and decrement
273 * the iteration position by one. If the beginning
274 * of the text has already been reached, {@link #DONE} is returned.
275 */
276UChar32 Normalizer::previous() {
277 if(bufferPos>0 || previousNormalize()) {
278 UChar32 c=buffer.char32At(bufferPos-1);
4388f060 279 bufferPos-=U16_LENGTH(c);
b75a7d8f
A
280 return c;
281 } else {
282 return DONE;
283 }
284}
285
286void Normalizer::reset() {
729e4ab9 287 currentIndex=nextIndex=text->setToStart();
b75a7d8f
A
288 clearBuffer();
289}
290
291void
292Normalizer::setIndexOnly(int32_t index) {
729e4ab9
A
293 text->setIndex(index); // pins index
294 currentIndex=nextIndex=text->getIndex();
b75a7d8f
A
295 clearBuffer();
296}
297
298/**
729e4ab9
A
299 * Return the first character in the normalized text. This resets
300 * the <tt>Normalizer's</tt> position to the beginning of the text.
b75a7d8f
A
301 */
302UChar32 Normalizer::first() {
303 reset();
304 return next();
305}
306
307/**
729e4ab9 308 * Return the last character in the normalized text. This resets
b75a7d8f
A
309 * the <tt>Normalizer's</tt> position to be just before the
310 * the input text corresponding to that normalized character.
311 */
312UChar32 Normalizer::last() {
729e4ab9 313 currentIndex=nextIndex=text->setToEnd();
b75a7d8f
A
314 clearBuffer();
315 return previous();
316}
317
318/**
319 * Retrieve the current iteration position in the input text that is
320 * being normalized. This method is useful in applications such as
321 * searching, where you need to be able to determine the position in
322 * the input text that corresponds to a given normalized output character.
323 * <p>
324 * <b>Note:</b> This method sets the position in the <em>input</em>, while
325 * {@link #next} and {@link #previous} iterate through characters in the
326 * <em>output</em>. This means that there is not necessarily a one-to-one
327 * correspondence between characters returned by <tt>next</tt> and
328 * <tt>previous</tt> and the indices passed to and returned from
329 * <tt>setIndex</tt> and {@link #getIndex}.
330 *
331 */
332int32_t Normalizer::getIndex() const {
333 if(bufferPos<buffer.length()) {
334 return currentIndex;
335 } else {
336 return nextIndex;
337 }
338}
339
340/**
729e4ab9 341 * Retrieve the index of the start of the input text. This is the begin index
b75a7d8f
A
342 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
343 * over which this <tt>Normalizer</tt> is iterating
344 */
345int32_t Normalizer::startIndex() const {
729e4ab9 346 return text->startIndex();
b75a7d8f
A
347}
348
349/**
729e4ab9 350 * Retrieve the index of the end of the input text. This is the end index
b75a7d8f
A
351 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
352 * over which this <tt>Normalizer</tt> is iterating
353 */
354int32_t Normalizer::endIndex() const {
729e4ab9 355 return text->endIndex();
b75a7d8f
A
356}
357
358//-------------------------------------------------------------------------
359// Property access methods
360//-------------------------------------------------------------------------
361
362void
363Normalizer::setMode(UNormalizationMode newMode)
364{
365 fUMode = newMode;
729e4ab9 366 init();
b75a7d8f
A
367}
368
369UNormalizationMode
370Normalizer::getUMode() const
371{
372 return fUMode;
373}
374
375void
376Normalizer::setOption(int32_t option,
377 UBool value)
378{
379 if (value) {
380 fOptions |= option;
381 } else {
382 fOptions &= (~option);
383 }
729e4ab9 384 init();
b75a7d8f
A
385}
386
387UBool
388Normalizer::getOption(int32_t option) const
389{
390 return (fOptions & option) != 0;
391}
392
393/**
394 * Set the input text over which this <tt>Normalizer</tt> will iterate.
729e4ab9 395 * The iteration position is set to the beginning of the input text.
b75a7d8f
A
396 */
397void
398Normalizer::setText(const UnicodeString& newText,
399 UErrorCode &status)
400{
401 if (U_FAILURE(status)) {
402 return;
403 }
404 CharacterIterator *newIter = new StringCharacterIterator(newText);
405 if (newIter == NULL) {
406 status = U_MEMORY_ALLOCATION_ERROR;
407 return;
408 }
729e4ab9
A
409 delete text;
410 text = newIter;
b75a7d8f
A
411 reset();
412}
413
414/**
415 * Set the input text over which this <tt>Normalizer</tt> will iterate.
416 * The iteration position is set to the beginning of the string.
417 */
418void
419Normalizer::setText(const CharacterIterator& newText,
420 UErrorCode &status)
421{
422 if (U_FAILURE(status)) {
423 return;
424 }
425 CharacterIterator *newIter = newText.clone();
426 if (newIter == NULL) {
427 status = U_MEMORY_ALLOCATION_ERROR;
428 return;
429 }
729e4ab9
A
430 delete text;
431 text = newIter;
b75a7d8f
A
432 reset();
433}
434
435void
436Normalizer::setText(const UChar* newText,
437 int32_t length,
438 UErrorCode &status)
439{
440 if (U_FAILURE(status)) {
441 return;
442 }
443 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
444 if (newIter == NULL) {
445 status = U_MEMORY_ALLOCATION_ERROR;
446 return;
447 }
729e4ab9
A
448 delete text;
449 text = newIter;
b75a7d8f
A
450 reset();
451}
452
453/**
454 * Copies the text under iteration into the UnicodeString referred to by "result".
455 * @param result Receives a copy of the text under iteration.
456 */
457void
458Normalizer::getText(UnicodeString& result)
459{
729e4ab9 460 text->getText(result);
b75a7d8f
A
461}
462
463//-------------------------------------------------------------------------
464// Private utility methods
465//-------------------------------------------------------------------------
466
467void Normalizer::clearBuffer() {
468 buffer.remove();
469 bufferPos=0;
470}
471
472UBool
473Normalizer::nextNormalize() {
b75a7d8f
A
474 clearBuffer();
475 currentIndex=nextIndex;
729e4ab9
A
476 text->setIndex(nextIndex);
477 if(!text->hasNext()) {
b75a7d8f
A
478 return FALSE;
479 }
729e4ab9
A
480 // Skip at least one character so we make progress.
481 UnicodeString segment(text->next32PostInc());
482 while(text->hasNext()) {
483 UChar32 c;
484 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
485 text->move32(-1, CharacterIterator::kCurrent);
486 break;
487 }
488 segment.append(c);
b75a7d8f 489 }
729e4ab9
A
490 nextIndex=text->getIndex();
491 UErrorCode errorCode=U_ZERO_ERROR;
492 fNorm2->normalize(segment, buffer, errorCode);
b75a7d8f
A
493 return U_SUCCESS(errorCode) && !buffer.isEmpty();
494}
495
496UBool
497Normalizer::previousNormalize() {
b75a7d8f
A
498 clearBuffer();
499 nextIndex=currentIndex;
729e4ab9
A
500 text->setIndex(currentIndex);
501 if(!text->hasPrevious()) {
b75a7d8f
A
502 return FALSE;
503 }
729e4ab9
A
504 UnicodeString segment;
505 while(text->hasPrevious()) {
506 UChar32 c=text->previous32();
507 segment.insert(0, c);
508 if(fNorm2->hasBoundaryBefore(c)) {
509 break;
510 }
b75a7d8f 511 }
729e4ab9
A
512 currentIndex=text->getIndex();
513 UErrorCode errorCode=U_ZERO_ERROR;
514 fNorm2->normalize(segment, buffer, errorCode);
b75a7d8f 515 bufferPos=buffer.length();
b75a7d8f
A
516 return U_SUCCESS(errorCode) && !buffer.isEmpty();
517}
518
519U_NAMESPACE_END
520
521#endif /* #if !UCONFIG_NO_NORMALIZATION */