]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unicode/normlzr.h
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / unicode / normlzr.h
CommitLineData
b75a7d8f
A
1/*
2 ********************************************************************
3 * COPYRIGHT:
4 * Copyright (c) 1996-2003, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************
7 */
8
9#ifndef NORMLZR_H
10#define NORMLZR_H
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_NORMALIZATION
15
16#include "unicode/uobject.h"
17#include "unicode/unistr.h"
18#include "unicode/chariter.h"
19#include "unicode/unorm.h"
20
// Forward declaration: within this header UCharIterator is used only as the
// pointee type of the private Normalizer::text member.
21struct UCharIterator;
22typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
23
24U_NAMESPACE_BEGIN
25/**
26 * \brief C++ API: Unicode Normalization
27 *
28 * The Normalizer class consists of two parts:
29 * - static functions that normalize strings or test if strings are normalized
30 * - a Normalizer object is an iterator that takes any kind of text and
31 * provides iteration over its normalized form
32 *
33 * The Normalizer class is not suitable for subclassing.
34 *
35 * The static functions are basically wrappers around the C implementation,
36 * using UnicodeString instead of UChar*.
37 * For basic information about normalization forms and details about the C API
38 * please see the documentation in unorm.h.
39 *
40 * The iterator API with the Normalizer constructors and the non-static functions
41 * uses a CharacterIterator as input. It is possible to pass a string which
42 * is then internally wrapped in a CharacterIterator.
43 * The input text is not normalized all at once, but incrementally where needed
44 * (providing efficient random access).
45 * This makes it possible to pass in a large text but spend only a small amount of time
46 * normalizing a small part of that text.
47 * However, if the entire text is normalized, then the iterator will be
48 * slower than normalizing the entire text at once and iterating over the result.
49 * A possible use of the Normalizer iterator is also to report an index into the
50 * original text that is close to where the normalized characters come from.
51 *
52 * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
53 * The earlier implementation reported the getIndex() inconsistently,
54 * and previous() could not be used after setIndex(), next(), first(), and current().
55 *
56 * Normalizer allows normalization to start anywhere in the input text by
57 * calling setIndexOnly(), first(), or last().
58 * Without calling any of these, the iterator will start at the beginning of the text.
59 *
60 * At any time, next() returns the next normalized code point (UChar32),
61 * with post-increment semantics (like CharacterIterator::next32PostInc()).
62 * previous() returns the previous normalized code point (UChar32),
63 * with pre-decrement semantics (like CharacterIterator::previous32()).
64 *
65 * current() returns the current code point
66 * (respectively the one at the newly set index) without moving
67 * the getIndex(). Note that if the text at the current position
68 * needs to be normalized, then these functions will do that.
69 * (This is why current() is not const.)
70 * It is more efficient to call setIndexOnly() instead, which does not
71 * normalize.
72 *
73 * getIndex() always refers to the position in the input text where the normalized
74 * code points are returned from. It does not always change with each returned
75 * code point.
76 * The code point that is returned from any of the functions
77 * corresponds to text at or after getIndex(), according to the
78 * function's iteration semantics (post-increment or pre-decrement).
79 *
80 * next() returns a code point from at or after the getIndex()
81 * from before the next() call. After the next() call, the getIndex()
82 * might have moved to where the next code point will be returned from
83 * (from a next() or current() call).
84 * This is semantically equivalent to array access with array[index++]
85 * (post-increment semantics).
86 *
87 * previous() returns a code point from at or after the getIndex()
88 * from after the previous() call.
89 * This is semantically equivalent to array access with array[--index]
90 * (pre-decrement semantics).
91 *
92 * Internally, the Normalizer iterator normalizes a small piece of text
93 * starting at the getIndex() and ending at a following "safe" index.
94 * The normalized result is stored in an internal string buffer, and
95 * the code points are iterated from there.
96 * With multiple iteration calls, this is repeated until the next piece
97 * of text needs to be normalized, and the getIndex() needs to be moved.
98 *
99 * The following "safe" index, the internal buffer, and the secondary
100 * iteration index into that buffer are not exposed on the API.
101 * This also means that it is currently not practical to return to
102 * a particular, arbitrary position in the text because one would need to
103 * know, and be able to set, in addition to the getIndex(), at least also the
104 * current index into the internal buffer.
105 * It is currently only possible to observe when getIndex() changes
106 * (with careful consideration of the iteration semantics),
107 * at which time the internal index will be 0.
108 * For example, if getIndex() is different after next() than before it,
109 * then the internal index is 0 and one can return to this getIndex()
110 * later with setIndexOnly().
111 *
112 * @author Laura Werner, Mark Davis, Markus Scherer
113 * @stable ICU 2.0
114 */
115class U_COMMON_API Normalizer : public UObject {
116public:
117 /**
118 * If DONE is returned from an iteration function that returns a code point,
119 * then there are no more normalization results available.
 * DONE has the value 0xffff and can therefore be confused with a U+FFFF
 * code point in the text; see next() and previous() for how to tell them apart.
120 * @stable ICU 2.0
121 */
122 enum {
123 DONE=0xffff
124 };
125
126 // Constructors
127
128 /**
129 * Creates a new <code>Normalizer</code> object for iterating over the
130 * normalized form of a given string.
131 * <p>
132 * @param str The string to be normalized. The normalization
133 * will start at the beginning of the string.
134 *
135 * @param mode The normalization mode.
136 * @stable ICU 2.0
137 */
138 Normalizer(const UnicodeString& str, UNormalizationMode mode);
139
140 /**
141 * Creates a new <code>Normalizer</code> object for iterating over the
142 * normalized form of a given string.
143 * <p>
144 * @param str The string to be normalized. The normalization
145 * will start at the beginning of the string.
146 *
147 * @param length Length of the string, or -1 if NUL-terminated.
148 * @param mode The normalization mode.
149 * @stable ICU 2.0
150 */
151 Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
152
153 /**
154 * Creates a new <code>Normalizer</code> object for iterating over the
155 * normalized form of the given text.
156 * <p>
157 * @param iter The input text to be normalized. The normalization
158 * will start at the beginning of the text.
159 *
160 * @param mode The normalization mode.
161 * @stable ICU 2.0
162 */
163 Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
164
165 /**
166 * Copy constructor.
167 * @param copy The object to be copied.
168 * @stable ICU 2.0
169 */
170 Normalizer(const Normalizer& copy);
171
172 /**
173 * Destructor
174 * @stable ICU 2.0
175 */
176 ~Normalizer();
177
178
179 //-------------------------------------------------------------------------
180 // Static utility methods
181 //-------------------------------------------------------------------------
182
183 /**
184 * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
185 * This is a wrapper for unorm_normalize(), using UnicodeString's.
186 *
187 * The <code>options</code> parameter specifies which optional
188 * <code>Normalizer</code> features are to be enabled for this operation.
189 *
190 * @param source the input string to be normalized.
191 * @param mode the normalization mode
192 * @param options the optional features to be enabled (0 for no options)
193 * @param result The normalized string (on output).
194 * @param status The error code.
195 * @stable ICU 2.0
196 */
197 static void normalize(const UnicodeString& source,
198 UNormalizationMode mode, int32_t options,
199 UnicodeString& result,
200 UErrorCode &status);
201
202 /**
203 * Compose a <code>UnicodeString</code>.
204 * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
205 * This is a wrapper for unorm_normalize(), using UnicodeString's.
206 *
207 * The <code>options</code> parameter specifies which optional
208 * <code>Normalizer</code> features are to be enabled for this operation.
209 *
210 * @param source the string to be composed.
211 * @param compat Perform compatibility decomposition before composition.
212 * If this argument is <code>FALSE</code>, only canonical
213 * decomposition will be performed.
214 * @param options the optional features to be enabled (0 for no options)
215 * @param result The composed string (on output).
216 * @param status The error code.
217 * @stable ICU 2.0
218 */
219 static void compose(const UnicodeString& source,
220 UBool compat, int32_t options,
221 UnicodeString& result,
222 UErrorCode &status);
223
224 /**
225 * Static method to decompose a <code>UnicodeString</code>.
226 * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
227 * This is a wrapper for unorm_normalize(), using UnicodeString's.
228 *
229 * The <code>options</code> parameter specifies which optional
230 * <code>Normalizer</code> features are to be enabled for this operation.
231 *
232 * @param source the string to be decomposed.
233 * @param compat Perform compatibility decomposition.
234 * If this argument is <code>FALSE</code>, only canonical
235 * decomposition will be performed.
236 * @param options the optional features to be enabled (0 for no options)
237 * @param result The decomposed string (on output).
238 * @param status The error code.
239 * @stable ICU 2.0
240 */
241 static void decompose(const UnicodeString& source,
242 UBool compat, int32_t options,
243 UnicodeString& result,
244 UErrorCode &status);
245
246 /**
247 * Performs a quick check on a string to determine whether the string is
248 * in a particular normalization format.
249 * This is a wrapper for unorm_quickCheck(), using a UnicodeString.
250 *
251 * One of three results can be returned: UNORM_YES, UNORM_NO or
252 * UNORM_MAYBE. Result UNORM_YES indicates that the argument
253 * string is in the desired normalized format, UNORM_NO indicates that the
254 * argument string is not in the desired normalized format. A
255 * UNORM_MAYBE result indicates that a more thorough check is required,
256 * the user may have to put the string in its normalized form and compare the
257 * results.
 *
 * If status already indicates a failure on entry, UNORM_MAYBE is returned
 * without calling unorm_quickCheck().
258 * @param source string for determining if it is in a normalized format
259 * @param mode normalization format
260 * @param status A reference to a UErrorCode to receive any errors
261 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
262 *
263 * @see isNormalized
264 * @stable ICU 2.0
265 */
266 static inline UNormalizationCheckResult
267 quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
268
269 /**
270 * Performs a quick check on a string; same as the other version of quickCheck
271 * but takes an extra options parameter like most normalization functions.
 * This is a wrapper for unorm_quickCheckWithOptions(), using a UnicodeString.
 *
 * If status already indicates a failure on entry, UNORM_MAYBE is returned
 * without calling the C function.
272 *
273 * @param source string for determining if it is in a normalized format
274 * @param mode normalization format
275 * @param options the optional features to be enabled (0 for no options)
276 * @param status A reference to a UErrorCode to receive any errors
277 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
278 *
279 * @see isNormalized
280 * @draft ICU 2.6
281 */
282 static inline UNormalizationCheckResult
283 quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
284
285 /**
286 * Test if a string is in a given normalization form.
287 * This is semantically equivalent to source.equals(normalize(source, mode)).
288 *
289 * Unlike unorm_quickCheck(), this function returns a definitive result,
290 * never a "maybe".
291 * For NFD, NFKD, and FCD, both functions work exactly the same.
292 * For NFC and NFKC where quickCheck may return "maybe", this function will
293 * perform further tests to arrive at a TRUE/FALSE result.
 *
 * If errorCode already indicates a failure on entry, FALSE is returned
 * without calling unorm_isNormalized().
294 *
295 * @param src String that is to be tested if it is in a normalization format.
296 * @param mode Which normalization form to test for.
297 * @param errorCode ICU error code in/out parameter.
298 * Must fulfill U_SUCCESS before the function call.
299 * @return Boolean value indicating whether the source string is in the
300 * "mode" normalization form.
301 *
302 * @see quickCheck
303 * @draft ICU 2.2
304 */
305 static inline UBool
306 isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
307
308 /**
309 * Test if a string is in a given normalization form; same as the other version of isNormalized
310 * but takes an extra options parameter like most normalization functions.
 * This is a wrapper for unorm_isNormalizedWithOptions(), using a UnicodeString.
 *
 * If errorCode already indicates a failure on entry, FALSE is returned
 * without calling the C function.
311 *
312 * @param src String that is to be tested if it is in a normalization format.
313 * @param mode Which normalization form to test for.
314 * @param options the optional features to be enabled (0 for no options)
315 * @param errorCode ICU error code in/out parameter.
316 * Must fulfill U_SUCCESS before the function call.
317 * @return Boolean value indicating whether the source string is in the
318 * "mode" normalization form.
319 *
320 * @see quickCheck
321 * @draft ICU 2.6
322 */
323 static inline UBool
324 isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
325
326 /**
327 * Concatenate normalized strings, making sure that the result is normalized as well.
328 *
329 * If both the left and the right strings are in
330 * the normalization form according to "mode/options",
331 * then the result will be
332 *
333 * \code
334 * dest=normalize(left+right, mode, options)
335 * \endcode
336 *
337 * For details see unorm_concatenate in unorm.h.
338 *
339 * @param left Left source string.
340 * @param right Right source string.
341 * @param result The output string.
342 * @param mode The normalization mode.
343 * @param options A bit set of normalization options.
344 * @param errorCode ICU error code in/out parameter.
345 * Must fulfill U_SUCCESS before the function call.
346 * @return result
347 *
348 * @see unorm_concatenate
349 * @see normalize
350 * @see unorm_next
351 * @see unorm_previous
352 *
353 * @stable ICU 2.1
354 */
355 static UnicodeString &
356 concatenate(UnicodeString &left, UnicodeString &right,
357 UnicodeString &result,
358 UNormalizationMode mode, int32_t options,
359 UErrorCode &errorCode);
360
361 /**
362 * Compare two strings for canonical equivalence.
363 * Further options include case-insensitive comparison and
364 * code point order (as opposed to code unit order).
 * This is a wrapper for unorm_compare(), using UnicodeString's;
 * all argument checking is done in unorm_compare().
365 *
366 * Canonical equivalence between two strings is defined as their normalized
367 * forms (NFD or NFC) being identical.
368 * This function compares strings incrementally instead of normalizing
369 * (and optionally case-folding) both strings entirely,
370 * improving performance significantly.
371 *
372 * Bulk normalization is only necessary if the strings do not fulfill the FCD
373 * conditions. Only in this case, and only if the strings are relatively long,
374 * is memory allocated temporarily.
375 * For FCD strings and short non-FCD strings there is no memory allocation.
376 *
377 * Semantically, this is equivalent to
378 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
379 * where code point order and foldCase are all optional.
380 *
381 * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
382 * the case folding must be performed first, then the normalization.
383 *
384 * @param s1 First source string.
385 * @param s2 Second source string.
386 *
387 * @param options A bit set of options:
388 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
389 * Case-sensitive comparison in code unit order, and the input strings
390 * are quick-checked for FCD.
391 *
392 * - UNORM_INPUT_IS_FCD
393 * Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
394 * If not set, the function will quickCheck for FCD
395 * and normalize if necessary.
396 *
397 * - U_COMPARE_CODE_POINT_ORDER
398 * Set to choose code point order instead of code unit order
399 * (see u_strCompare for details).
400 *
401 * - U_COMPARE_IGNORE_CASE
402 * Set to compare strings case-insensitively using case folding,
403 * instead of case-sensitively.
404 * If set, then the following case folding options are used.
405 *
406 * - Options as used with case-insensitive comparisons, currently:
407 *
408 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
409 * (see u_strCaseCompare for details)
410 *
411 * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
412 *
413 * @param errorCode ICU error code in/out parameter.
414 * Must fulfill U_SUCCESS before the function call.
415 * @return <0 or 0 or >0 as usual for string comparisons
416 *
417 * @see unorm_compare
418 * @see normalize
419 * @see UNORM_FCD
420 * @see u_strCompare
421 * @see u_strCaseCompare
422 *
423 * @draft ICU 2.2
424 */
425 static inline int32_t
426 compare(const UnicodeString &s1, const UnicodeString &s2,
427 uint32_t options,
428 UErrorCode &errorCode);
429
430 //-------------------------------------------------------------------------
431 // Iteration API
432 //-------------------------------------------------------------------------
433
434 /**
435 * Return the current character in the normalized text.
436 * current() may need to normalize some text at getIndex().
437 * The getIndex() is not changed.
438 *
439 * @return the current normalized code point
440 * @stable ICU 2.0
441 */
442 UChar32 current(void);
443
444 /**
445 * Return the first character in the normalized text.
446 * This is equivalent to setIndexOnly(startIndex()) followed by next().
447 * (Post-increment semantics.)
448 *
449 * @return the first normalized code point
450 * @stable ICU 2.0
451 */
452 UChar32 first(void);
453
454 /**
455 * Return the last character in the normalized text.
456 * This is equivalent to setIndexOnly(endIndex()) followed by previous().
457 * (Pre-decrement semantics.)
458 *
459 * @return the last normalized code point
460 * @stable ICU 2.0
461 */
462 UChar32 last(void);
463
464 /**
465 * Return the next character in the normalized text.
466 * (Post-increment semantics.)
467 * If the end of the text has already been reached, DONE is returned.
468 * The DONE value could be confused with a U+FFFF non-character code point
469 * in the text. If this is possible, you can test getIndex()<endIndex()
470 * before calling next(), or (getIndex()<endIndex() || last()!=DONE)
471 * after calling next(). (Calling last() will change the iterator state!)
472 *
473 * The C API unorm_next() is more efficient and does not have this ambiguity.
474 *
475 * @return the next normalized code point
476 * @stable ICU 2.0
477 */
478 UChar32 next(void);
479
480 /**
481 * Return the previous character in the normalized text and decrement.
482 * (Pre-decrement semantics.)
483 * If the beginning of the text has already been reached, DONE is returned.
484 * The DONE value could be confused with a U+FFFF non-character code point
485 * in the text. If this is possible, you can test
486 * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
487 * the iterator state!)
488 *
489 * The C API unorm_previous() is more efficient and does not have this ambiguity.
490 *
491 * @return the previous normalized code point
492 * @stable ICU 2.0
493 */
494 UChar32 previous(void);
495
496 /**
497 * Set the iteration position in the input text that is being normalized,
498 * without any immediate normalization.
499 * After setIndexOnly(), getIndex() will return the same index that is
500 * specified here.
501 *
502 * @param index the desired index in the input text.
503 * @stable ICU 2.0
504 */
505 void setIndexOnly(int32_t index);
506
507 /**
508 * Reset the index to the beginning of the text.
509 * This is equivalent to setIndexOnly(startIndex()).
510 * @stable ICU 2.0
511 */
512 void reset(void);
513
514 /**
515 * Retrieve the current iteration position in the input text that is
516 * being normalized.
517 *
518 * A following call to next() will return a normalized code point from
519 * the input text at or after this index.
520 *
521 * After a call to previous(), getIndex() will point at or before the
522 * position in the input text where the normalized code point
523 * was returned from with previous().
524 *
525 * @return the current index in the input text
526 * @stable ICU 2.0
527 */
528 int32_t getIndex(void) const;
529
530 /**
531 * Retrieve the index of the start of the input text. This is the begin index
532 * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
533 * over which this <code>Normalizer</code> is iterating.
534 *
535 * @return the smallest index in the input text where the Normalizer operates
536 * @stable ICU 2.0
537 */
538 int32_t startIndex(void) const;
539
540 /**
541 * Retrieve the index of the end of the input text. This is the end index
542 * of the <code>CharacterIterator</code> or the length of the string
543 * over which this <code>Normalizer</code> is iterating.
544 * This end index is exclusive, i.e., the Normalizer operates only on characters
545 * before this index.
546 *
547 * @return the first index in the input text where the Normalizer does not operate
548 * @stable ICU 2.0
549 */
550 int32_t endIndex(void) const;
551
552 /**
553 * Returns TRUE when both iterators refer to the same character in the same
554 * input text.
555 *
556 * @param that a Normalizer object to compare this one to
557 * @return comparison result
558 * @stable ICU 2.0
559 */
560 UBool operator==(const Normalizer& that) const;
561
562 /**
563 * Returns FALSE when both iterators refer to the same character in the same
564 * input text, and TRUE otherwise; this is the logical negation of operator==.
565 *
566 * @param that a Normalizer object to compare this one to
567 * @return comparison result
568 * @stable ICU 2.0
569 */
570 inline UBool operator!=(const Normalizer& that) const;
571
572 /**
573 * Returns a pointer to a new Normalizer that is a clone of this one.
574 * The caller is responsible for deleting the new clone.
575 * @return a pointer to a new Normalizer
576 * @stable ICU 2.0
577 */
578 Normalizer* clone(void) const;
579
580 /**
581 * Generates a hash code for this iterator.
582 *
583 * @return the hash code
584 * @stable ICU 2.0
585 */
586 int32_t hashCode(void) const;
587
588 //-------------------------------------------------------------------------
589 // Property access methods
590 //-------------------------------------------------------------------------
591
592 /**
593 * Set the normalization mode for this object.
594 * <p>
595 * <b>Note:</b>If the normalization mode is changed while iterating
596 * over a string, calls to {@link next()} and {@link previous()} may
597 * return previously buffered characters in the old normalization mode
598 * until the iteration is able to re-sync at the next base character.
599 * It is safest to call {@link setIndexOnly()}, {@link reset()},
600 * {@link setText()}, {@link first()},
601 * {@link last()}, etc. after calling <code>setMode</code>.
602 * <p>
603 * @param newMode the new mode for this <code>Normalizer</code>.
604 * @see #getUMode
605 * @stable ICU 2.0
606 */
607 void setMode(UNormalizationMode newMode);
608
609 /**
610 * Return the normalization mode for this object.
611 *
612 * This is an unusual name because there used to be a getMode() that
613 * returned a different type.
614 *
615 * @return the mode for this <code>Normalizer</code>
616 * @see #setMode
617 * @stable ICU 2.0
618 */
619 UNormalizationMode getUMode(void) const;
620
621 /**
622 * Set options that affect this <code>Normalizer</code>'s operation.
623 * Options do not change the basic composition or decomposition operation
624 * that is being performed, but they control whether
625 * certain optional portions of the operation are done.
626 * Currently the only available option is obsolete.
627 *
628 * It is possible to specify multiple options that are all turned on or off.
629 *
630 * @param option the option(s) whose value is/are to be set.
631 * @param value the new setting for the option. Use <code>TRUE</code> to
632 * turn the option(s) on and <code>FALSE</code> to turn it/them off.
633 *
634 * @see #getOption
635 * @stable ICU 2.0
636 */
637 void setOption(int32_t option,
638 UBool value);
639
640 /**
641 * Determine whether an option is turned on or off.
642 * If multiple options are specified, then the result is TRUE if any
643 * of them are set.
644 * <p>
645 * @param option the option(s) that are to be checked
646 * @return TRUE if any of the option(s) are set
647 * @see #setOption
648 * @stable ICU 2.0
649 */
650 UBool getOption(int32_t option) const;
651
652 /**
653 * Set the input text over which this <code>Normalizer</code> will iterate.
654 * The iteration position is set to the beginning.
655 *
656 * @param newText a string that replaces the current input text
657 * @param status a UErrorCode
658 * @stable ICU 2.0
659 */
660 void setText(const UnicodeString& newText,
661 UErrorCode &status);
662
663 /**
664 * Set the input text over which this <code>Normalizer</code> will iterate.
665 * The iteration position is set to the beginning.
666 *
667 * @param newText a CharacterIterator object that replaces the current input text
668 * @param status a UErrorCode
669 * @stable ICU 2.0
670 */
671 void setText(const CharacterIterator& newText,
672 UErrorCode &status);
673
674 /**
675 * Set the input text over which this <code>Normalizer</code> will iterate.
676 * The iteration position is set to the beginning.
677 *
678 * @param newText a string that replaces the current input text
679 * @param length the length of the string, or -1 if NUL-terminated
680 * @param status a UErrorCode
681 * @stable ICU 2.0
682 */
683 void setText(const UChar* newText,
684 int32_t length,
685 UErrorCode &status);
686 /**
687 * Copies the input text into the UnicodeString argument.
688 *
689 * @param result Receives a copy of the text under iteration.
690 * @stable ICU 2.0
691 */
692 void getText(UnicodeString& result);
693
694 /**
695 * ICU "poor man's RTTI", returns a UClassID for the actual class.
 * The inline implementation forwards to getStaticClassID().
696 * @return a UClassID for the actual class.
697 * @draft ICU 2.2
698 */
699 virtual inline UClassID getDynamicClassID() const;
700
701 /**
702 * ICU "poor man's RTTI", returns a UClassID for this class.
 * The ID is the address of the private static member fgClassID.
703 * @return a UClassID for this class.
704 * @draft ICU 2.2
705 */
706 static inline UClassID getStaticClassID();
707
708private:
709 //-------------------------------------------------------------------------
710 // Private functions
711 //-------------------------------------------------------------------------
712
713 Normalizer(); // default constructor not implemented
714 Normalizer &operator=(const Normalizer &that); // assignment operator not implemented
715
716 // Private utility methods for iteration
717 // For documentation, see the source code
718 UBool nextNormalize();
719 UBool previousNormalize();
720
721 void init(CharacterIterator *iter);
722 void clearBuffer(void);
723
724 //-------------------------------------------------------------------------
725 // Private data
726 //-------------------------------------------------------------------------
727
 // The normalization mode and option bits; see setMode()/getUMode()
 // and setOption()/getOption().
728 UNormalizationMode fUMode;
729 int32_t fOptions;
730
731 // The input text and our position in it
732 UCharIterator *text;
733
734 // The normalization buffer is the result of normalization
735 // of the source in [currentIndex..nextIndex[ .
736 int32_t currentIndex, nextIndex;
737
738 // A buffer for holding intermediate results
 // NOTE(review): bufferPos is presumably the iteration index into buffer
 // (the "secondary iteration index" described in the class comment) - confirm in normlzr.cpp.
739 UnicodeString buffer;
740 int32_t bufferPos;
741
742 /**
743 * The address of this static class variable serves as this class's ID
744 * for ICU "poor man's RTTI".
745 */
746 static const char fgClassID;
747};
748
749//-------------------------------------------------------------------------
750// Inline implementations
751//-------------------------------------------------------------------------
752
// Returns the class ID for Normalizer: the address of the static member
// fgClassID, cast to UClassID.
753inline UClassID
754Normalizer::getStaticClassID()
755{ return (UClassID)&fgClassID; }
756
// Returns the class ID of this object by forwarding to
// Normalizer::getStaticClassID().
757inline UClassID
758Normalizer::getDynamicClassID() const
759{ return Normalizer::getStaticClassID(); }
760
// Inequality is defined as the logical negation of operator==.
761inline UBool
762Normalizer::operator!= (const Normalizer& other) const
763{ return ! operator==(other); }
764
// Quick-checks whether source is in normalization form `mode`.
// Returns UNORM_MAYBE immediately if status already holds a failure;
// otherwise passes the string's buffer and length to unorm_quickCheck()
// and returns its result, with any error reported through status.
765inline UNormalizationCheckResult
766Normalizer::quickCheck(const UnicodeString& source,
767 UNormalizationMode mode,
768 UErrorCode &status) {
769 if(U_FAILURE(status)) { // a pre-existing error short-circuits the check
770 return UNORM_MAYBE;
771 }
772
773 return unorm_quickCheck(source.getBuffer(), source.length(),
774 mode, &status);
775}
776
// Same as the three-argument quickCheck(), but additionally passes
// `options` through to unorm_quickCheckWithOptions().
// Returns UNORM_MAYBE immediately if status already holds a failure.
777inline UNormalizationCheckResult
778Normalizer::quickCheck(const UnicodeString& source,
779 UNormalizationMode mode, int32_t options,
780 UErrorCode &status) {
781 if(U_FAILURE(status)) { // a pre-existing error short-circuits the check
782 return UNORM_MAYBE;
783 }
784
785 return unorm_quickCheckWithOptions(source.getBuffer(), source.length(),
786 mode, options, &status);
787}
788
// Tests whether source is in normalization form `mode`.
// Returns FALSE immediately if status already holds a failure; otherwise
// passes the string's buffer and length to unorm_isNormalized() and returns
// its result, with any error reported through status.
// (The class declaration names these parameters src and errorCode.)
789inline UBool
790Normalizer::isNormalized(const UnicodeString& source,
791 UNormalizationMode mode,
792 UErrorCode &status) {
793 if(U_FAILURE(status)) { // a pre-existing error short-circuits the test
794 return FALSE;
795 }
796
797 return unorm_isNormalized(source.getBuffer(), source.length(),
798 mode, &status);
799}
800
// Same as the three-argument isNormalized(), but additionally passes
// `options` through to unorm_isNormalizedWithOptions().
// Returns FALSE immediately if status already holds a failure.
801inline UBool
802Normalizer::isNormalized(const UnicodeString& source,
803 UNormalizationMode mode, int32_t options,
804 UErrorCode &status) {
805 if(U_FAILURE(status)) { // a pre-existing error short-circuits the test
806 return FALSE;
807 }
808
809 return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(),
810 mode, options, &status);
811}
812
// Compares s1 and s2 by passing each string's buffer and length, the option
// bits, and the error code straight to unorm_compare(), and returns its result.
// Unlike quickCheck()/isNormalized(), there is no U_FAILURE pre-check here.
813inline int32_t
814Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
815 uint32_t options,
816 UErrorCode &errorCode) {
817 // all argument checking is done in unorm_compare
818 return unorm_compare(s1.getBuffer(), s1.length(),
819 s2.getBuffer(), s2.length(),
820 options,
821 &errorCode);
822}
823
824U_NAMESPACE_END
825
826#endif /* #if !UCONFIG_NO_NORMALIZATION */
827
828#endif // NORMLZR_H