]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/normlzr.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / common / normlzr.cpp
CommitLineData
b75a7d8f
A
1/*
2 *************************************************************************
3 * COPYRIGHT:
73c04bcf 4 * Copyright (c) 1996-2005, International Business Machines Corporation and
b75a7d8f
A
5 * others. All Rights Reserved.
6 *************************************************************************
7 */
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_NORMALIZATION
12
13#include "unicode/unistr.h"
14#include "unicode/chariter.h"
15#include "unicode/schriter.h"
16#include "unicode/uchriter.h"
17#include "unicode/uiter.h"
18#include "unicode/normlzr.h"
19#include "cmemory.h"
20#include "unormimp.h"
21
22U_NAMESPACE_BEGIN
23
374ca955 24UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
b75a7d8f
A
25
26//-------------------------------------------------------------------------
27// Constructors and other boilerplate
28//-------------------------------------------------------------------------
29
30Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
31 UObject(), fUMode(mode), fOptions(0),
32 currentIndex(0), nextIndex(0),
33 buffer(), bufferPos(0)
34{
35 init(new StringCharacterIterator(str));
36}
37
38Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
39 UObject(), fUMode(mode), fOptions(0),
40 currentIndex(0), nextIndex(0),
41 buffer(), bufferPos(0)
42{
43 init(new UCharCharacterIterator(str, length));
44}
45
46Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
47 UObject(), fUMode(mode), fOptions(0),
48 currentIndex(0), nextIndex(0),
49 buffer(), bufferPos(0)
50{
51 init(iter.clone());
52}
53
54Normalizer::Normalizer(const Normalizer &copy) :
55 UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions),
56 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
57 buffer(copy.buffer), bufferPos(copy.bufferPos)
58{
59 init(((CharacterIterator *)(copy.text->context))->clone());
60}
61
62static const UChar _NUL=0;
63
64void
65Normalizer::init(CharacterIterator *iter) {
66 UErrorCode errorCode=U_ZERO_ERROR;
67
68 text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator));
69 if(text!=NULL) {
70 if(unorm_haveData(&errorCode)) {
71 uiter_setCharacterIterator(text, iter);
72 } else {
73 delete iter;
74 uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
75 }
76 } else {
77 delete iter;
78 }
79}
80
81Normalizer::~Normalizer()
82{
83 if(text!=NULL) {
84 delete (CharacterIterator *)text->context;
85 uprv_free(text);
86 }
87}
88
89Normalizer*
90Normalizer::clone() const
91{
92 if(this!=0) {
93 return new Normalizer(*this);
94 } else {
95 return 0;
96 }
97}
98
99/**
100 * Generates a hash code for this iterator.
101 */
102int32_t Normalizer::hashCode() const
103{
104 return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
105}
106
107UBool Normalizer::operator==(const Normalizer& that) const
108{
109 return
110 this==&that ||
111 fUMode==that.fUMode &&
112 fOptions==that.fOptions &&
113 *((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
114 buffer==that.buffer &&
115 bufferPos==that.bufferPos &&
116 nextIndex==that.nextIndex;
117}
118
119//-------------------------------------------------------------------------
120// Static utility methods
121//-------------------------------------------------------------------------
122
374ca955 123void U_EXPORT2
b75a7d8f
A
124Normalizer::normalize(const UnicodeString& source,
125 UNormalizationMode mode, int32_t options,
126 UnicodeString& result,
127 UErrorCode &status) {
128 if(source.isBogus() || U_FAILURE(status)) {
129 result.setToBogus();
130 if(U_SUCCESS(status)) {
131 status=U_ILLEGAL_ARGUMENT_ERROR;
132 }
133 } else {
134 UnicodeString localDest;
135 UnicodeString *dest;
136
137 if(&source!=&result) {
138 dest=&result;
139 } else {
140 // the source and result strings are the same object, use a temporary one
141 dest=&localDest;
142 }
143
144 UChar *buffer=dest->getBuffer(source.length());
145 int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
146 source.getBuffer(), source.length(),
147 mode, options,
148 &status);
73c04bcf 149 dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
b75a7d8f
A
150 if(status==U_BUFFER_OVERFLOW_ERROR) {
151 status=U_ZERO_ERROR;
152 buffer=dest->getBuffer(length);
153 length=unorm_internalNormalize(buffer, dest->getCapacity(),
154 source.getBuffer(), source.length(),
155 mode, options,
156 &status);
73c04bcf 157 dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
b75a7d8f
A
158 }
159
160 if(dest==&localDest) {
161 result=*dest;
162 }
163 if(U_FAILURE(status)) {
164 result.setToBogus();
165 }
166 }
167}
168
374ca955 169void U_EXPORT2
b75a7d8f
A
170Normalizer::compose(const UnicodeString& source,
171 UBool compat, int32_t options,
172 UnicodeString& result,
173 UErrorCode &status) {
174 if(source.isBogus() || U_FAILURE(status)) {
175 result.setToBogus();
176 if(U_SUCCESS(status)) {
177 status=U_ILLEGAL_ARGUMENT_ERROR;
178 }
179 } else {
180 UnicodeString localDest;
181 UnicodeString *dest;
182
183 if(&source!=&result) {
184 dest=&result;
185 } else {
186 // the source and result strings are the same object, use a temporary one
187 dest=&localDest;
188 }
189
190 UChar *buffer=dest->getBuffer(source.length());
191 int32_t length=unorm_compose(buffer, dest->getCapacity(),
192 source.getBuffer(), source.length(),
193 compat, options,
194 &status);
73c04bcf 195 dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
b75a7d8f
A
196 if(status==U_BUFFER_OVERFLOW_ERROR) {
197 status=U_ZERO_ERROR;
198 buffer=dest->getBuffer(length);
199 length=unorm_compose(buffer, dest->getCapacity(),
200 source.getBuffer(), source.length(),
201 compat, options,
202 &status);
73c04bcf 203 dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
b75a7d8f
A
204 }
205
206 if(dest==&localDest) {
207 result=*dest;
208 }
209 if(U_FAILURE(status)) {
210 result.setToBogus();
211 }
212 }
213}
214
374ca955 215void U_EXPORT2
b75a7d8f
A
216Normalizer::decompose(const UnicodeString& source,
217 UBool compat, int32_t options,
218 UnicodeString& result,
219 UErrorCode &status) {
220 if(source.isBogus() || U_FAILURE(status)) {
221 result.setToBogus();
222 if(U_SUCCESS(status)) {
223 status=U_ILLEGAL_ARGUMENT_ERROR;
224 }
225 } else {
226 UnicodeString localDest;
227 UnicodeString *dest;
228
229 if(&source!=&result) {
230 dest=&result;
231 } else {
232 // the source and result strings are the same object, use a temporary one
233 dest=&localDest;
234 }
235
236 UChar *buffer=dest->getBuffer(source.length());
237 int32_t length=unorm_decompose(buffer, dest->getCapacity(),
238 source.getBuffer(), source.length(),
239 compat, options,
240 &status);
73c04bcf 241 dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
b75a7d8f
A
242 if(status==U_BUFFER_OVERFLOW_ERROR) {
243 status=U_ZERO_ERROR;
244 buffer=dest->getBuffer(length);
245 length=unorm_decompose(buffer, dest->getCapacity(),
246 source.getBuffer(), source.length(),
247 compat, options,
248 &status);
73c04bcf 249 dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
b75a7d8f
A
250 }
251
252 if(dest==&localDest) {
253 result=*dest;
254 }
255 if(U_FAILURE(status)) {
256 result.setToBogus();
257 }
258 }
259}
260
374ca955 261UnicodeString & U_EXPORT2
b75a7d8f
A
262Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
263 UnicodeString &result,
264 UNormalizationMode mode, int32_t options,
265 UErrorCode &errorCode) {
266 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
267 result.setToBogus();
268 if(U_SUCCESS(errorCode)) {
269 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
270 }
271 } else {
272 UnicodeString localDest;
273 UnicodeString *dest;
274
275 if(&left!=&result && &right!=&result) {
276 dest=&result;
277 } else {
278 // the source and result strings are the same object, use a temporary one
279 dest=&localDest;
280 }
281
282 UChar *buffer=dest->getBuffer(left.length()+right.length());
283 int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
284 right.getBuffer(), right.length(),
285 buffer, dest->getCapacity(),
286 mode, options,
287 &errorCode);
73c04bcf 288 dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f
A
289 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
290 errorCode=U_ZERO_ERROR;
291 buffer=dest->getBuffer(length);
292 int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
293 right.getBuffer(), right.length(),
294 buffer, dest->getCapacity(),
295 mode, options,
296 &errorCode);
73c04bcf 297 dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f
A
298 }
299
300 if(dest==&localDest) {
301 result=*dest;
302 }
303 if(U_FAILURE(errorCode)) {
304 result.setToBogus();
305 }
306 }
307 return result;
308}
309
310//-------------------------------------------------------------------------
311// Iteration API
312//-------------------------------------------------------------------------
313
314/**
315 * Return the current character in the normalized text.
316 */
317UChar32 Normalizer::current() {
318 if(bufferPos<buffer.length() || nextNormalize()) {
319 return buffer.char32At(bufferPos);
320 } else {
321 return DONE;
322 }
323}
324
325/**
326 * Return the next character in the normalized text and advance
327 * the iteration position by one. If the end
328 * of the text has already been reached, {@link #DONE} is returned.
329 */
330UChar32 Normalizer::next() {
331 if(bufferPos<buffer.length() || nextNormalize()) {
332 UChar32 c=buffer.char32At(bufferPos);
333 bufferPos+=UTF_CHAR_LENGTH(c);
334 return c;
335 } else {
336 return DONE;
337 }
338}
339
340/**
341 * Return the previous character in the normalized text and decrement
342 * the iteration position by one. If the beginning
343 * of the text has already been reached, {@link #DONE} is returned.
344 */
345UChar32 Normalizer::previous() {
346 if(bufferPos>0 || previousNormalize()) {
347 UChar32 c=buffer.char32At(bufferPos-1);
348 bufferPos-=UTF_CHAR_LENGTH(c);
349 return c;
350 } else {
351 return DONE;
352 }
353}
354
355void Normalizer::reset() {
356 currentIndex=nextIndex=text->move(text, 0, UITER_START);
357 clearBuffer();
358}
359
360void
361Normalizer::setIndexOnly(int32_t index) {
362 currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
363 clearBuffer();
364}
365
366/**
367 * Return the first character in the normalized text-> This resets
368 * the <tt>Normalizer's</tt> position to the beginning of the text->
369 */
370UChar32 Normalizer::first() {
371 reset();
372 return next();
373}
374
375/**
376 * Return the last character in the normalized text-> This resets
377 * the <tt>Normalizer's</tt> position to be just before the
378 * the input text corresponding to that normalized character.
379 */
380UChar32 Normalizer::last() {
381 currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
382 clearBuffer();
383 return previous();
384}
385
386/**
387 * Retrieve the current iteration position in the input text that is
388 * being normalized. This method is useful in applications such as
389 * searching, where you need to be able to determine the position in
390 * the input text that corresponds to a given normalized output character.
391 * <p>
392 * <b>Note:</b> This method sets the position in the <em>input</em>, while
393 * {@link #next} and {@link #previous} iterate through characters in the
394 * <em>output</em>. This means that there is not necessarily a one-to-one
395 * correspondence between characters returned by <tt>next</tt> and
396 * <tt>previous</tt> and the indices passed to and returned from
397 * <tt>setIndex</tt> and {@link #getIndex}.
398 *
399 */
400int32_t Normalizer::getIndex() const {
401 if(bufferPos<buffer.length()) {
402 return currentIndex;
403 } else {
404 return nextIndex;
405 }
406}
407
408/**
409 * Retrieve the index of the start of the input text-> This is the begin index
410 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
411 * over which this <tt>Normalizer</tt> is iterating
412 */
413int32_t Normalizer::startIndex() const {
414 return text->getIndex(text, UITER_START);
415}
416
417/**
418 * Retrieve the index of the end of the input text-> This is the end index
419 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
420 * over which this <tt>Normalizer</tt> is iterating
421 */
422int32_t Normalizer::endIndex() const {
423 return text->getIndex(text, UITER_LIMIT);
424}
425
426//-------------------------------------------------------------------------
427// Property access methods
428//-------------------------------------------------------------------------
429
430void
431Normalizer::setMode(UNormalizationMode newMode)
432{
433 fUMode = newMode;
434}
435
436UNormalizationMode
437Normalizer::getUMode() const
438{
439 return fUMode;
440}
441
442void
443Normalizer::setOption(int32_t option,
444 UBool value)
445{
446 if (value) {
447 fOptions |= option;
448 } else {
449 fOptions &= (~option);
450 }
451}
452
453UBool
454Normalizer::getOption(int32_t option) const
455{
456 return (fOptions & option) != 0;
457}
458
459/**
460 * Set the input text over which this <tt>Normalizer</tt> will iterate.
461 * The iteration position is set to the beginning of the input text->
462 */
463void
464Normalizer::setText(const UnicodeString& newText,
465 UErrorCode &status)
466{
467 if (U_FAILURE(status)) {
468 return;
469 }
470 CharacterIterator *newIter = new StringCharacterIterator(newText);
471 if (newIter == NULL) {
472 status = U_MEMORY_ALLOCATION_ERROR;
473 return;
474 }
475 delete (CharacterIterator *)(text->context);
476 text->context = newIter;
477 reset();
478}
479
480/**
481 * Set the input text over which this <tt>Normalizer</tt> will iterate.
482 * The iteration position is set to the beginning of the string.
483 */
484void
485Normalizer::setText(const CharacterIterator& newText,
486 UErrorCode &status)
487{
488 if (U_FAILURE(status)) {
489 return;
490 }
491 CharacterIterator *newIter = newText.clone();
492 if (newIter == NULL) {
493 status = U_MEMORY_ALLOCATION_ERROR;
494 return;
495 }
496 delete (CharacterIterator *)(text->context);
497 text->context = newIter;
498 reset();
499}
500
501void
502Normalizer::setText(const UChar* newText,
503 int32_t length,
504 UErrorCode &status)
505{
506 if (U_FAILURE(status)) {
507 return;
508 }
509 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
510 if (newIter == NULL) {
511 status = U_MEMORY_ALLOCATION_ERROR;
512 return;
513 }
514 delete (CharacterIterator *)(text->context);
515 text->context = newIter;
516 reset();
517}
518
519/**
520 * Copies the text under iteration into the UnicodeString referred to by "result".
521 * @param result Receives a copy of the text under iteration.
522 */
523void
524Normalizer::getText(UnicodeString& result)
525{
526 ((CharacterIterator *)(text->context))->getText(result);
527}
528
529//-------------------------------------------------------------------------
530// Private utility methods
531//-------------------------------------------------------------------------
532
533void Normalizer::clearBuffer() {
534 buffer.remove();
535 bufferPos=0;
536}
537
538UBool
539Normalizer::nextNormalize() {
540 UChar *p;
541 int32_t length;
542 UErrorCode errorCode;
543
544 clearBuffer();
545 currentIndex=nextIndex;
546 text->move(text, nextIndex, UITER_ZERO);
547 if(!text->hasNext(text)) {
548 return FALSE;
549 }
550
551 errorCode=U_ZERO_ERROR;
552 p=buffer.getBuffer(-1);
553 length=unorm_next(text, p, buffer.getCapacity(),
554 fUMode, fOptions,
555 TRUE, 0,
556 &errorCode);
73c04bcf 557 buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f
A
558 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
559 errorCode=U_ZERO_ERROR;
560 text->move(text, nextIndex, UITER_ZERO);
561 p=buffer.getBuffer(length);
562 length=unorm_next(text, p, buffer.getCapacity(),
563 fUMode, fOptions,
564 TRUE, 0,
565 &errorCode);
73c04bcf 566 buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f
A
567 }
568
569 nextIndex=text->getIndex(text, UITER_CURRENT);
570 return U_SUCCESS(errorCode) && !buffer.isEmpty();
571}
572
573UBool
574Normalizer::previousNormalize() {
575 UChar *p;
576 int32_t length;
577 UErrorCode errorCode;
578
579 clearBuffer();
580 nextIndex=currentIndex;
581 text->move(text, currentIndex, UITER_ZERO);
582 if(!text->hasPrevious(text)) {
583 return FALSE;
584 }
585
586 errorCode=U_ZERO_ERROR;
587 p=buffer.getBuffer(-1);
588 length=unorm_previous(text, p, buffer.getCapacity(),
589 fUMode, fOptions,
590 TRUE, 0,
591 &errorCode);
73c04bcf 592 buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f
A
593 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
594 errorCode=U_ZERO_ERROR;
595 text->move(text, currentIndex, UITER_ZERO);
596 p=buffer.getBuffer(length);
597 length=unorm_previous(text, p, buffer.getCapacity(),
598 fUMode, fOptions,
599 TRUE, 0,
600 &errorCode);
73c04bcf 601 buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
b75a7d8f
A
602 }
603
604 bufferPos=buffer.length();
605 currentIndex=text->getIndex(text, UITER_CURRENT);
606 return U_SUCCESS(errorCode) && !buffer.isEmpty();
607}
608
609U_NAMESPACE_END
610
611#endif /* #if !UCONFIG_NO_NORMALIZATION */