icuSources/common/unicode/unorm2.h

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2009-2015, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  unorm2.h
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2009dec15
  16 *   created by: Markus W. Scherer
  17 */
  18
  19 #ifndef __UNORM2_H__
  20 #define __UNORM2_H__
  21
  22 /**
  23  * \file
  24  * \brief C API: New API for Unicode Normalization.
  25  *
  26  * Unicode normalization functionality for standard Unicode normalization or
  27  * for using custom mapping tables.
  28  * All instances of UNormalizer2 are unmodifiable/immutable.
  29  * Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
  30  * For more details see the Normalizer2 C++ class.
  31  */
  32
  33 #include "unicode/utypes.h"
  34 #include "unicode/localpointer.h"
  35 #include "unicode/stringoptions.h"
  36 #include "unicode/uset.h"
  37
  38 /**
  39  * Constants for normalization modes, passed as the mode argument of unorm2_getInstance().
  40  * For details about standard Unicode normalization forms
  41  * and about the algorithms which are also used with custom mapping tables
  42  * see http://www.unicode.org/reports/tr15/
  43  * @stable ICU 4.4
  44  */
  45 typedef enum {
  46     /**
  47      * Decomposition followed by composition.
  48      * Same as standard NFC when using an "nfc" instance.
  49      * Same as standard NFKC when using an "nfkc" instance.
  50      * For details about standard Unicode normalization forms
  51      * see http://www.unicode.org/reports/tr15/
  52      * @stable ICU 4.4
  53      */
  54     UNORM2_COMPOSE,
  55     /**
  56      * Map, and reorder canonically.
  57      * Same as standard NFD when using an "nfc" instance.
  58      * Same as standard NFKD when using an "nfkc" instance.
  59      * For details about standard Unicode normalization forms
  60      * see http://www.unicode.org/reports/tr15/
  61      * @stable ICU 4.4
  62      */
  63     UNORM2_DECOMPOSE,
  64     /**
  65      * "Fast C or D" form.
  66      * If a string is in this form, then further decomposition <i>without reordering</i>
  67      * would yield the same form as UNORM2_DECOMPOSE.
  68      * Text in "Fast C or D" form can be processed efficiently with data tables
  69      * that are "canonically closed", that is, that provide equivalent data for
  70      * equivalent text, without having to be fully normalized.
  71      * Not a standard Unicode normalization form.
  72      * Not a unique form: Different FCD strings can be canonically equivalent.
  73      * For details see http://www.unicode.org/notes/tn5/#FCD
  74      * @stable ICU 4.4
  75      */
  76     UNORM2_FCD,
  77     /**
  78      * Compose only contiguously.
  79      * Also known as "FCC" or "Fast C Contiguous".
  80      * The result will often but not always be in NFC.
  81      * The result will conform to FCD which is useful for processing.
  82      * Not a standard Unicode normalization form.
  83      * For details see http://www.unicode.org/notes/tn5/#FCC
  84      * @stable ICU 4.4
  85      */
  86     UNORM2_COMPOSE_CONTIGUOUS
  87 } UNormalization2Mode;
  88
  89 /**
  90  * Result values for normalization quick check functions such as unorm2_quickCheck().
  91  * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
  92  * @stable ICU 2.0
  93  */
  94 typedef enum UNormalizationCheckResult {
  95   /**
  96    * The input string is not in the normalization form.
  97    * @stable ICU 2.0
  98    */
  99   UNORM_NO,
 100   /**
 101    * The input string is in the normalization form.
 102    * @stable ICU 2.0
 103    */
 104   UNORM_YES,
 105   /**
 106    * The input string may or may not be in the normalization form.
 107    * This value is only returned for composition forms like NFC and FCC,
 108    * when a backward-combining character is found for which the surrounding text
 109    * would have to be analyzed further.
 110    * @stable ICU 2.0
 111    */
 112   UNORM_MAYBE
 113 } UNormalizationCheckResult;
 114
 115 /**
 116  * Opaque C service object type for the new normalization API; it is only declared in this header, so callers use it via pointers.
 117  * @stable ICU 4.4
 118  */
 119 struct UNormalizer2;
 120 typedef struct UNormalizer2 UNormalizer2;  /**< C typedef for struct UNormalizer2. @stable ICU 4.4 */
 121
 122 #if !UCONFIG_NO_NORMALIZATION
 123
 124 /**
 125  * Returns a UNormalizer2 instance for Unicode NFC normalization.
 126  * Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode).
 127  * Returns an unmodifiable singleton instance. Do not close it with unorm2_close().
 128  * @param pErrorCode Standard ICU error code. Its input value must
 129  *                  pass the U_SUCCESS() test, or else the function returns
 130  *                  immediately. Check for U_FAILURE() on output or use with
 131  *                  function chaining. (See User Guide for details.)
 132  * @return the requested UNormalizer2, if successful
 133  * @stable ICU 49
 134  */
 135 U_STABLE const UNormalizer2 * U_EXPORT2
 136 unorm2_getNFCInstance(UErrorCode *pErrorCode);
 137
 138 /**
 139  * Returns a UNormalizer2 instance for Unicode NFD normalization.
 140  * Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode).
 141  * Returns an unmodifiable singleton instance. Do not close it with unorm2_close().
 142  * @param pErrorCode Standard ICU error code. Its input value must
 143  *                  pass the U_SUCCESS() test, or else the function returns
 144  *                  immediately. Check for U_FAILURE() on output or use with
 145  *                  function chaining. (See User Guide for details.)
 146  * @return the requested UNormalizer2, if successful
 147  * @stable ICU 49
 148  */
 149 U_STABLE const UNormalizer2 * U_EXPORT2
 150 unorm2_getNFDInstance(UErrorCode *pErrorCode);
 151
 152 /**
 153  * Returns a UNormalizer2 instance for Unicode NFKC normalization.
 154  * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode).
 155  * Returns an unmodifiable singleton instance. Do not close it with unorm2_close().
 156  * @param pErrorCode Standard ICU error code. Its input value must
 157  *                  pass the U_SUCCESS() test, or else the function returns
 158  *                  immediately. Check for U_FAILURE() on output or use with
 159  *                  function chaining. (See User Guide for details.)
 160  * @return the requested UNormalizer2, if successful
 161  * @stable ICU 49
 162  */
 163 U_STABLE const UNormalizer2 * U_EXPORT2
 164 unorm2_getNFKCInstance(UErrorCode *pErrorCode);
 165
 166 /**
 167  * Returns a UNormalizer2 instance for Unicode NFKD normalization.
 168  * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode).
 169  * Returns an unmodifiable singleton instance. Do not close it with unorm2_close().
 170  * @param pErrorCode Standard ICU error code. Its input value must
 171  *                  pass the U_SUCCESS() test, or else the function returns
 172  *                  immediately. Check for U_FAILURE() on output or use with
 173  *                  function chaining. (See User Guide for details.)
 174  * @return the requested UNormalizer2, if successful
 175  * @stable ICU 49
 176  */
 177 U_STABLE const UNormalizer2 * U_EXPORT2
 178 unorm2_getNFKDInstance(UErrorCode *pErrorCode);
 179
 180 /**
 181  * Returns a UNormalizer2 instance for Unicode NFKC_Casefold normalization.
 182  * Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode).
 183  * Returns an unmodifiable singleton instance. Do not close it with unorm2_close().
 184  * @param pErrorCode Standard ICU error code. Its input value must
 185  *                  pass the U_SUCCESS() test, or else the function returns
 186  *                  immediately. Check for U_FAILURE() on output or use with
 187  *                  function chaining. (See User Guide for details.)
 188  * @return the requested UNormalizer2, if successful
 189  * @stable ICU 49
 190  */
 191 U_STABLE const UNormalizer2 * U_EXPORT2
 192 unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode);
 193
 194 /**
 195  * Returns a UNormalizer2 instance which uses the specified data file
 196  * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
 197  * and which composes or decomposes text according to the specified mode.
 198  * Returns an unmodifiable singleton instance. Do not close it with unorm2_close().
 199  *
 200  * Use packageName=NULL for data files that are part of ICU's own data.
 201  * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
 202  * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
 203  * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
 204  *
 205  * @param packageName NULL for ICU built-in data, otherwise application data package name
 206  * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
 207  * @param mode normalization mode (one of the UNormalization2Mode constants: compose or decompose etc.)
 208  * @param pErrorCode Standard ICU error code. Its input value must
 209  *                  pass the U_SUCCESS() test, or else the function returns
 210  *                  immediately. Check for U_FAILURE() on output or use with
 211  *                  function chaining. (See User Guide for details.)
 212  * @return the requested UNormalizer2, if successful
 213  * @stable ICU 4.4
 214  */
 215 U_STABLE const UNormalizer2 * U_EXPORT2
 216 unorm2_getInstance(const char *packageName,
 217                    const char *name,
 218                    UNormalization2Mode mode,
 219                    UErrorCode *pErrorCode);
 220
 221 /**
 222  * Constructs a filtered normalizer wrapping any UNormalizer2 instance
 223  * and a filter set.
 224  * Both are aliased and must not be modified or deleted while this object
 225  * is used.
 226  * The filter set should be frozen; otherwise the performance will suffer greatly.
 227  * @param norm2 wrapped UNormalizer2 instance
 228  * @param filterSet USet which determines the characters to be normalized
 229  * @param pErrorCode Standard ICU error code. Its input value must
 230  *                   pass the U_SUCCESS() test, or else the function returns
 231  *                   immediately. Check for U_FAILURE() on output or use with
 232  *                   function chaining. (See User Guide for details.)
 233  * @return the requested UNormalizer2, if successful; the caller closes it with unorm2_close()
 234  * @stable ICU 4.4
 235  */
 236 U_STABLE UNormalizer2 * U_EXPORT2
 237 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode);
 238
 239 /**
 240  * Closes a UNormalizer2 instance from unorm2_openFiltered().
 241  * Do not close instances from unorm2_getInstance() or from the unorm2_get...Instance() functions documented as equivalent to it; those are singletons!
 242  * @param norm2 UNormalizer2 instance to be closed
 243  * @stable ICU 4.4
 244  */
 245 U_STABLE void U_EXPORT2
 246 unorm2_close(UNormalizer2 *norm2);
 247
 248 #if U_SHOW_CPLUSPLUS_API
 249
 250 U_NAMESPACE_BEGIN
 251
 252 /**
 253  * \class LocalUNormalizer2Pointer
 254  * "Smart pointer" class, closes a UNormalizer2 via unorm2_close().
 255  * For most methods see the LocalPointerBase base class.
 256  * Wrap only instances from unorm2_openFiltered(); instances from unorm2_getInstance() must not be closed.
 257  * @see LocalPointerBase
 258  * @see LocalPointer
 259  * @stable ICU 4.4
 260  */
 261 U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close);
 262
 263 U_NAMESPACE_END
 264
 265 #endif // U_SHOW_CPLUSPLUS_API
 266
 267 /**
 268  * Writes the normalized form of the source string to the destination string
 269  * (replacing its contents) and returns the length of the destination string.
 270  * The source and destination strings must be different buffers.
 271  * @param norm2 UNormalizer2 instance
 272  * @param src source string
 273  * @param length length of the source string, or -1 if NUL-terminated
 274  * @param dest destination string; its contents are replaced with normalized src
 275  * @param capacity number of UChars that can be written to dest
 276  * @param pErrorCode Standard ICU error code. Its input value must
 277  *                   pass the U_SUCCESS() test, or else the function returns
 278  *                   immediately. Check for U_FAILURE() on output or use with
 279  *                   function chaining. (See User Guide for details.)
 280  * @return the length of the destination string (dest)
 281  * @stable ICU 4.4
 282  */
 283 U_STABLE int32_t U_EXPORT2
 284 unorm2_normalize(const UNormalizer2 *norm2,
 285                  const UChar *src, int32_t length,
 286                  UChar *dest, int32_t capacity,
 287                  UErrorCode *pErrorCode);
 288 /**
 289  * Appends the normalized form of the second string to the first string
 290  * (merging them at the boundary) and returns the length of the first string.
 291  * The result is normalized if the first string was normalized.
 292  * The first and second strings must be different buffers.
 293  * @param norm2 UNormalizer2 instance
 294  * @param first string, should be normalized
 295  * @param firstLength length of the first string, or -1 if NUL-terminated
 296  * @param firstCapacity number of UChars that can be written to first
 297  * @param second string, will be normalized
 298  * @param secondLength length of the second string, or -1 if NUL-terminated
 299  * @param pErrorCode Standard ICU error code. Its input value must
 300  *                   pass the U_SUCCESS() test, or else the function returns
 301  *                   immediately. Check for U_FAILURE() on output or use with
 302  *                   function chaining. (See User Guide for details.)
 303  * @return the length of the first string (first) after appending
 304  * @stable ICU 4.4
 305  */
 306 U_STABLE int32_t U_EXPORT2
 307 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
 308                                 UChar *first, int32_t firstLength, int32_t firstCapacity,
 309                                 const UChar *second, int32_t secondLength,
 310                                 UErrorCode *pErrorCode);
 311 /**
 312  * Appends the second string to the first string
 313  * (merging them at the boundary) and returns the length of the first string.
 314  * The result is normalized if both the strings were normalized.
 315  * The first and second strings must be different buffers.
 316  * @param norm2 UNormalizer2 instance
 317  * @param first string, should be normalized
 318  * @param firstLength length of the first string, or -1 if NUL-terminated
 319  * @param firstCapacity number of UChars that can be written to first
 320  * @param second string, should be normalized
 321  * @param secondLength length of the second string, or -1 if NUL-terminated
 322  * @param pErrorCode Standard ICU error code. Its input value must
 323  *                   pass the U_SUCCESS() test, or else the function returns
 324  *                   immediately. Check for U_FAILURE() on output or use with
 325  *                   function chaining. (See User Guide for details.)
 326  * @return the length of the first string (first) after appending
 327  * @stable ICU 4.4
 328  */
 329 U_STABLE int32_t U_EXPORT2
 330 unorm2_append(const UNormalizer2 *norm2,
 331               UChar *first, int32_t firstLength, int32_t firstCapacity,
 332               const UChar *second, int32_t secondLength,
 333               UErrorCode *pErrorCode);
 334
 335 /**
 336  * Gets the decomposition mapping of c.
 337  * Roughly equivalent to normalizing the string form of c
 338  * on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function
 339  * returns a negative value and does not write a string
 340  * if c does not have a decomposition mapping in this instance's data.
 341  * This function is independent of the mode of the UNormalizer2.
 342  * @param norm2 UNormalizer2 instance
 343  * @param c code point
 344  * @param decomposition UChar buffer which will be set to c's
 345  *                      decomposition mapping, if there is one.
 346  * @param capacity number of UChars that can be written to decomposition
 347  * @param pErrorCode Standard ICU error code. Its input value must
 348  *                   pass the U_SUCCESS() test, or else the function returns
 349  *                   immediately. Check for U_FAILURE() on output or use with
 350  *                   function chaining. (See User Guide for details.)
 351  * @return the non-negative length of c's decomposition, if there is one; otherwise a negative value
 352  * @stable ICU 4.6
 353  */
 354 U_STABLE int32_t U_EXPORT2
 355 unorm2_getDecomposition(const UNormalizer2 *norm2,
 356                         UChar32 c, UChar *decomposition, int32_t capacity,
 357                         UErrorCode *pErrorCode);
 358
 359 /**
 360  * Gets the raw decomposition mapping of c.
 361  *
 362  * This is similar to the unorm2_getDecomposition() function but returns the
 363  * raw decomposition mapping as specified in UnicodeData.txt or
 364  * (for custom data) in the mapping files processed by the gennorm2 tool.
 365  * By contrast, unorm2_getDecomposition() returns the processed,
 366  * recursively-decomposed version of this mapping.
 367  *
 368  * When used on a standard NFKC UNormalizer2 instance,
 369  * unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
 370  *
 371  * When used on a standard NFC UNormalizer2 instance,
 372  * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
 373  * in this case, the result contains either one or two code points (=1..4 UChars).
 374  *
 375  * This function is independent of the mode of the UNormalizer2.
 376  * @param norm2 UNormalizer2 instance
 377  * @param c code point
 378  * @param decomposition UChar buffer which will be set to c's
 379  *                      raw decomposition mapping, if there is one.
 380  * @param capacity number of UChars that can be written to decomposition
 381  * @param pErrorCode Standard ICU error code. Its input value must
 382  *                   pass the U_SUCCESS() test, or else the function returns
 383  *                   immediately. Check for U_FAILURE() on output or use with
 384  *                   function chaining. (See User Guide for details.)
 385  * @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value
 386  * @stable ICU 49
 387  */
 388 U_STABLE int32_t U_EXPORT2
 389 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
 390                            UChar32 c, UChar *decomposition, int32_t capacity,
 391                            UErrorCode *pErrorCode);
 392
 393 /**
 394  * Performs pairwise composition of a and b and returns the composite if there is one.
 395  *
 396  * Returns a composite code point c only if c has a two-way mapping to a+b.
 397  * In standard Unicode normalization, this means that
 398  * c has a canonical decomposition to a+b
 399  * and c does not have the Full_Composition_Exclusion property.
 400  *
 401  * This function is independent of the mode of the UNormalizer2.
 402  * @param norm2 UNormalizer2 instance
 403  * @param a A (normalization starter) code point.
 404  * @param b Another code point.
 405  * @return The non-negative composite code point if there is one; otherwise a negative value.
 406  * @stable ICU 49
 407  */
 408 U_STABLE UChar32 U_EXPORT2
 409 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b);
 410
 411 /**
 412  * Gets the combining class of c.
 413  * The default implementation returns 0
 414  * but all standard implementations return the Unicode Canonical_Combining_Class value.
 415  * @param norm2 UNormalizer2 instance
 416  * @param c code point
 417  * @return c's combining class, as a uint8_t value
 418  * @stable ICU 49
 419  */
 420 U_STABLE uint8_t U_EXPORT2
 421 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c);
 422
 423 /**
 424  * Tests if the string is normalized.
 425  * Internally, in cases where unorm2_quickCheck() would return "maybe"
 426  * (which is only possible for the two COMPOSE modes) this function
 427  * resolves to "yes" or "no" to provide a definitive result,
 428  * at the cost of doing more work in those cases.
 429  * @param norm2 UNormalizer2 instance
 430  * @param s input string
 431  * @param length length of the string, or -1 if NUL-terminated
 432  * @param pErrorCode Standard ICU error code. Its input value must
 433  *                   pass the U_SUCCESS() test, or else the function returns
 434  *                   immediately. Check for U_FAILURE() on output or use with
 435  *                   function chaining. (See User Guide for details.)
 436  * @return TRUE if s is normalized
 437  * @stable ICU 4.4
 438  */
 439 U_STABLE UBool U_EXPORT2
 440 unorm2_isNormalized(const UNormalizer2 *norm2,
 441                     const UChar *s, int32_t length,
 442                     UErrorCode *pErrorCode);
 443
 444 /**
 445  * Tests if the string is normalized.
 446  * For the two COMPOSE modes, the result could be "maybe" in cases that
 447  * would take a little more work to resolve definitively.
 448  * Use unorm2_spanQuickCheckYes() and unorm2_normalizeSecondAndAppend() for a faster
 449  * combination of quick check + normalization, to avoid
 450  * re-checking the "yes" prefix.
 451  * @param norm2 UNormalizer2 instance
 452  * @param s input string
 453  * @param length length of the string, or -1 if NUL-terminated
 454  * @param pErrorCode Standard ICU error code. Its input value must
 455  *                   pass the U_SUCCESS() test, or else the function returns
 456  *                   immediately. Check for U_FAILURE() on output or use with
 457  *                   function chaining. (See User Guide for details.)
 458  * @return UNormalizationCheckResult (UNORM_YES, UNORM_NO or UNORM_MAYBE)
 459  * @stable ICU 4.4
 460  */
 461 U_STABLE UNormalizationCheckResult U_EXPORT2
 462 unorm2_quickCheck(const UNormalizer2 *norm2,
 463                   const UChar *s, int32_t length,
 464                   UErrorCode *pErrorCode);
 465
 466 /**
 467  * Returns the end of the normalized substring of the input string.
 468  * In other words, with <code>end=unorm2_spanQuickCheckYes(norm2, s, length, pErrorCode);</code>
 469  * the first <code>end</code> UChars of <code>s</code>
 470  * will pass the quick check with a "yes" result.
 471  *
 472  * The returned end index is usually one or more characters before the
 473  * "no" or "maybe" character: The end index is at a normalization boundary.
 474  * (See the Normalizer2 C++ class documentation for more about normalization boundaries.)
 475  *
 476  * When the goal is a normalized string and most input strings are expected
 477  * to be normalized already, then call this function,
 478  * and if it returns a prefix shorter than the input string,
 479  * copy that prefix and use unorm2_normalizeSecondAndAppend() for the remainder.
 480  * @param norm2 UNormalizer2 instance
 481  * @param s input string
 482  * @param length length of the string, or -1 if NUL-terminated
 483  * @param pErrorCode Standard ICU error code. Its input value must
 484  *                   pass the U_SUCCESS() test, or else the function returns
 485  *                   immediately. Check for U_FAILURE() on output or use with
 486  *                   function chaining. (See User Guide for details.)
 487  * @return "yes" span end index
 488  * @stable ICU 4.4
 489  */
 490 U_STABLE int32_t U_EXPORT2
 491 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
 492                          const UChar *s, int32_t length,
 493                          UErrorCode *pErrorCode);
 494
 495 /**
 496  * Tests if the character always has a normalization boundary before it,
 497  * regardless of context.
 498  * For details see the Normalizer2 C++ base class documentation.
 499  * @param norm2 UNormalizer2 instance
 500  * @param c character (code point) to test
 501  * @return TRUE if c has a normalization boundary before it
 502  * @stable ICU 4.4
 503  */
 504 U_STABLE UBool U_EXPORT2
 505 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);
 506
 507 /**
 508  * Tests if the character always has a normalization boundary after it,
 509  * regardless of context.
 510  * For details see the Normalizer2 C++ base class documentation.
 511  * @param norm2 UNormalizer2 instance
 512  * @param c character (code point) to test
 513  * @return TRUE if c has a normalization boundary after it
 514  * @stable ICU 4.4
 515  */
 516 U_STABLE UBool U_EXPORT2
 517 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);
 518
 519 /**
 520  * Tests if the character is normalization-inert.
 521  * For details see the Normalizer2 C++ base class documentation.
 522  * @param norm2 UNormalizer2 instance
 523  * @param c character (code point) to test
 524  * @return TRUE if c is normalization-inert
 525  * @stable ICU 4.4
 526  */
 527 U_STABLE UBool U_EXPORT2
 528 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c);
 529
 530 /**
 531  * Compares two strings for canonical equivalence; unlike the unorm2_ functions, it takes no UNormalizer2 instance.
 532  * Further options include case-insensitive comparison and
 533  * code point order (as opposed to code unit order).
 534  *
 535  * Canonical equivalence between two strings is defined as their normalized
 536  * forms (NFD or NFC) being identical.
 537  * This function compares strings incrementally instead of normalizing
 538  * (and optionally case-folding) both strings entirely,
 539  * improving performance significantly.
 540  *
 541  * Bulk normalization is only necessary if the strings do not fulfill the FCD
 542  * conditions. Only in this case, and only if the strings are relatively long,
 543  * is memory allocated temporarily.
 544  * For FCD strings and short non-FCD strings there is no memory allocation.
 545  *
 546  * Semantically, this is equivalent to
 547  *   strcmp[CodePointOrder](NFD(foldCase(NFD(s1))), NFD(foldCase(NFD(s2))))
 548  * where code point order and foldCase are all optional.
 549  *
 550  * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
 551  * the case folding must be performed first, then the normalization.
 552  *
 553  * @param s1 First source string.
 554  * @param length1 Length of first source string, or -1 if NUL-terminated.
 555  *
 556  * @param s2 Second source string.
 557  * @param length2 Length of second source string, or -1 if NUL-terminated.
 558  *
 559  * @param options A bit set of options:
 560  *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
 561  *     Case-sensitive comparison in code unit order, and the input strings
 562  *     are quick-checked for FCD.
 563  *
 564  *   - UNORM_INPUT_IS_FCD
 565  *     Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
 566  *     If not set, the function will quickCheck for FCD
 567  *     and normalize if necessary.
 568  *
 569  *   - U_COMPARE_CODE_POINT_ORDER
 570  *     Set to choose code point order instead of code unit order
 571  *     (see u_strCompare for details).
 572  *
 573  *   - U_COMPARE_IGNORE_CASE
 574  *     Set to compare strings case-insensitively using case folding,
 575  *     instead of case-sensitively.
 576  *     If set, then the following case folding options are used.
 577  *
 578  *   - Options as used with case-insensitive comparisons, currently:
 579  *
 580  *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
 581  *    (see u_strCaseCompare for details)
 582  *
 583  *   - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
 584  *
 585  * @param pErrorCode ICU error code in/out parameter.
 586  *                   Must fulfill U_SUCCESS before the function call.
 587  * @return <0 or 0 or >0 as usual for string comparisons
 588  *
 589  * @see unorm_normalize
 590  * @see UNORM_FCD
 591  * @see u_strCompare
 592  * @see u_strCaseCompare
 593  *
 594  * @stable ICU 2.2
 595  */
 596 U_STABLE int32_t U_EXPORT2
 597 unorm_compare(const UChar *s1, int32_t length1,
 598               const UChar *s2, int32_t length2,
 599               uint32_t options,
 600               UErrorCode *pErrorCode);
 601
 602 #endif  /* !UCONFIG_NO_NORMALIZATION */
 603 #endif  /* __UNORM2_H__ */