/*
********************************************************************
- * COPYRIGHT:
- * Copyright (c) 1996-2003, International Business Machines Corporation and
+ * COPYRIGHT:
+ * Copyright (c) 1996-2011, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*/
#include "unicode/utypes.h"
+/**
+ * \file
+ * \brief C++ API: Unicode Normalization
+ */
+
#if !UCONFIG_NO_NORMALIZATION
-#include "unicode/uobject.h"
-#include "unicode/unistr.h"
#include "unicode/chariter.h"
+#include "unicode/normalizer2.h"
+#include "unicode/unistr.h"
#include "unicode/unorm.h"
-
-struct UCharIterator;
-typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
+#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
/**
- * \brief C++ API: Unicode Normalization
+ * The Normalizer class supports the standard normalization forms described in
+ * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
+ * Unicode Standard Annex #15: Unicode Normalization Forms</a>.
+ *
+ * Note: This API has been replaced by the Normalizer2 class and is only available
+ * for backward compatibility. This class simply delegates to the Normalizer2 class.
+ * There is one exception: The new API does not provide a replacement for Normalizer::compare().
*
* The Normalizer class consists of two parts:
* - static functions that normalize strings or test if strings are normalized
*
* The Normalizer class is not suitable for subclassing.
*
- * The static functions are basically wrappers around the C implementation,
- * using UnicodeString instead of UChar*.
* For basic information about normalization forms and details about the C API
* please see the documentation in unorm.h.
*
* The iterator API with the Normalizer constructors and the non-static functions
- * uses a CharacterIterator as input. It is possible to pass a string which
+ * use a CharacterIterator as input. It is possible to pass a string which
* is then internally wrapped in a CharacterIterator.
* The input text is not normalized all at once, but incrementally where needed
* (providing efficient random access).
* then the internal index is 0 and one can return to this getIndex()
* later with setIndexOnly().
*
+ * Note: While the setIndex() and getIndex() refer to indices in the
+ * underlying Unicode input text, the next() and previous() methods
+ * iterate through characters in the normalized output.
+ * This means that there is not necessarily a one-to-one correspondence
+ * between characters returned by next() and previous() and the indices
+ * passed to and returned from setIndex() and getIndex().
+ * It is for this reason that Normalizer does not implement the CharacterIterator interface.
+ *
* @author Laura Werner, Mark Davis, Markus Scherer
* @stable ICU 2.0
*/
* @stable ICU 2.0
*/
Normalizer(const UnicodeString& str, UNormalizationMode mode);
-
+
/**
* Creates a new <code>Normalizer</code> object for iterating over the
* normalized form of a given string.
* Destructor
* @stable ICU 2.0
*/
- ~Normalizer();
+ virtual ~Normalizer();
//-------------------------------------------------------------------------
* @param status The error code.
* @stable ICU 2.0
*/
- static void normalize(const UnicodeString& source,
+ static void U_EXPORT2 normalize(const UnicodeString& source,
UNormalizationMode mode, int32_t options,
UnicodeString& result,
UErrorCode &status);
* @param status The error code.
* @stable ICU 2.0
*/
- static void compose(const UnicodeString& source,
+ static void U_EXPORT2 compose(const UnicodeString& source,
UBool compat, int32_t options,
UnicodeString& result,
UErrorCode &status);
* @param status The error code.
* @stable ICU 2.0
*/
- static void decompose(const UnicodeString& source,
+ static void U_EXPORT2 decompose(const UnicodeString& source,
UBool compat, int32_t options,
UnicodeString& result,
UErrorCode &status);
/**
- * Performing quick check on a string, to quickly determine if the string is
+ * Performing quick check on a string, to quickly determine if the string is
* in a particular normalization format.
* This is a wrapper for unorm_quickCheck(), using a UnicodeString.
*
* Three types of result can be returned: UNORM_YES, UNORM_NO or
* UNORM_MAYBE. Result UNORM_YES indicates that the argument
* string is in the desired normalized format, UNORM_NO determines that
- * argument string is not in the desired normalized format. A
- * UNORM_MAYBE result indicates that a more thorough check is required,
- * the user may have to put the string in its normalized form and compare the
+ * argument string is not in the desired normalized format. A
+ * UNORM_MAYBE result indicates that a more thorough check is required,
+ * the user may have to put the string in its normalized form and compare the
* results.
* @param source string for determining if it is in a normalized format
* @param mode normalization format
* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
*
* @see isNormalized
- * @draft ICU 2.6
+ * @stable ICU 2.6
*/
- static inline UNormalizationCheckResult
+ static UNormalizationCheckResult
quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
/**
* "mode" normalization form.
*
* @see quickCheck
- * @draft ICU 2.2
+ * @stable ICU 2.2
*/
static inline UBool
isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
* "mode" normalization form.
*
* @see quickCheck
- * @draft ICU 2.6
+ * @stable ICU 2.6
*/
- static inline UBool
+ static UBool
isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
/**
* @stable ICU 2.1
*/
static UnicodeString &
- concatenate(UnicodeString &left, UnicodeString &right,
+ U_EXPORT2 concatenate(const UnicodeString &left, const UnicodeString &right,
UnicodeString &result,
UNormalizationMode mode, int32_t options,
UErrorCode &errorCode);
* @see u_strCompare
* @see u_strCaseCompare
*
- * @draft ICU 2.2
+ * @stable ICU 2.2
*/
static inline int32_t
compare(const UnicodeString &s1, const UnicodeString &s2,
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
-
+
/**
* Return the current character in the normalized text.
* current() may need to normalize some text at getIndex().
/**
* Returns a pointer to a new Normalizer that is a clone of this one.
* The caller is responsible for deleting the new clone.
- * @return a pointer to a new Normalizer
+ * @return a pointer to a new Normalizer
* @stable ICU 2.0
*/
Normalizer* clone(void) const;
* Set the normalization mode for this object.
* <p>
* <b>Note:</b>If the normalization mode is changed while iterating
- * over a string, calls to {@link next()} and {@link previous()} may
+ * over a string, calls to {@link #next() } and {@link #previous() } may
* return previously buffered characters in the old normalization mode
* until the iteration is able to re-sync at the next base character.
- * It is safest to call {@link setIndexOnly()}, {@link reset()},
- * {@link setText()}, {@link first()},
- * {@link last()}, etc. after calling <code>setMode</code>.
+ * It is safest to call {@link #setIndexOnly }, {@link #reset() },
+ * {@link #setText }, {@link #first() },
+ * {@link #last() }, etc. after calling <code>setMode</code>.
* <p>
* @param newMode the new mode for this <code>Normalizer</code>.
* @see #getUMode
* @see #getOption
* @stable ICU 2.0
*/
- void setOption(int32_t option,
+ void setOption(int32_t option,
UBool value);
/**
* @param status a UErrorCode
* @stable ICU 2.0
*/
- void setText(const UnicodeString& newText,
+ void setText(const UnicodeString& newText,
UErrorCode &status);
/**
* @param status a UErrorCode
* @stable ICU 2.0
*/
- void setText(const CharacterIterator& newText,
+ void setText(const CharacterIterator& newText,
UErrorCode &status);
/**
void getText(UnicodeString& result);
/**
- * ICU "poor man's RTTI", returns a UClassID for the actual class.
- * @return a UClassID for the actual class.
- * @draft ICU 2.2
+ * ICU "poor man's RTTI", returns a UClassID for this class.
+ * @returns a UClassID for this class.
+ * @stable ICU 2.2
*/
- virtual inline UClassID getDynamicClassID() const;
+ static UClassID U_EXPORT2 getStaticClassID();
/**
- * ICU "poor man's RTTI", returns a UClassID for this class.
- * @returns a UClassID for this class.
- * @draft ICU 2.2
+ * ICU "poor man's RTTI", returns a UClassID for the actual class.
+ * @return a UClassID for the actual class.
+ * @stable ICU 2.2
*/
- static inline UClassID getStaticClassID();
+ virtual UClassID getDynamicClassID() const;
private:
//-------------------------------------------------------------------------
UBool nextNormalize();
UBool previousNormalize();
- void init(CharacterIterator *iter);
+ void init();
void clearBuffer(void);
//-------------------------------------------------------------------------
// Private data
//-------------------------------------------------------------------------
+ FilteredNormalizer2*fFilteredNorm2; // owned if not NULL
+ const Normalizer2 *fNorm2; // not owned; may be equal to fFilteredNorm2
UNormalizationMode fUMode;
int32_t fOptions;
// The input text and our position in it
- UCharIterator *text;
+ CharacterIterator *text;
// The normalization buffer is the result of normalization
// of the source in [currentIndex..nextIndex[ .
// A buffer for holding intermediate results
UnicodeString buffer;
int32_t bufferPos;
-
- /**
- * The address of this static class variable serves as this class's ID
- * for ICU "poor man's RTTI".
- */
- static const char fgClassID;
};
//-------------------------------------------------------------------------
// Inline implementations
//-------------------------------------------------------------------------
-inline UClassID
-Normalizer::getStaticClassID()
-{ return (UClassID)&fgClassID; }
-
-inline UClassID
-Normalizer::getDynamicClassID() const
-{ return Normalizer::getStaticClassID(); }
-
// Inequality operator: returns the logical negation of operator==(other).
inline UBool
Normalizer::operator!= (const Normalizer& other) const
{ return ! operator==(other); }
// quickCheck() overload without an options argument.
// After this change the body only forwards to the options-taking overload,
// passing 0 for the options. The removed ('-') lines are the previous inline
// bodies, which checked U_FAILURE(status) and then called unorm_quickCheck(),
// unorm_quickCheckWithOptions() and unorm_isNormalized() directly.
inline UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
- UNormalizationMode mode,
- UErrorCode &status) {
- if(U_FAILURE(status)) {
- return UNORM_MAYBE;
- }
-
- return unorm_quickCheck(source.getBuffer(), source.length(),
- mode, &status);
-}
-
-inline UNormalizationCheckResult
-Normalizer::quickCheck(const UnicodeString& source,
- UNormalizationMode mode, int32_t options,
+ UNormalizationMode mode,
UErrorCode &status) {
- if(U_FAILURE(status)) {
- return UNORM_MAYBE;
- }
-
- return unorm_quickCheckWithOptions(source.getBuffer(), source.length(),
- mode, options, &status);
-}
-
-inline UBool
-Normalizer::isNormalized(const UnicodeString& source,
- UNormalizationMode mode,
- UErrorCode &status) {
- if(U_FAILURE(status)) {
- return FALSE;
- }
-
- return unorm_isNormalized(source.getBuffer(), source.length(),
- mode, &status);
+ return quickCheck(source, mode, 0, status);
}
// isNormalized() overload without an options argument.
// After this change it forwards to the options-taking overload with options 0.
// The removed ('-') lines are the previous inline body of the options-taking
// overload, which returned FALSE on a failed status and otherwise called
// unorm_isNormalizedWithOptions().
inline UBool
Normalizer::isNormalized(const UnicodeString& source,
- UNormalizationMode mode, int32_t options,
+ UNormalizationMode mode,
UErrorCode &status) {
- if(U_FAILURE(status)) {
- return FALSE;
- }
-
- return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(),
- mode, options, &status);
+ return isNormalized(source, mode, 0, status);
}
inline int32_t