ICU-59173.0.1.tar.gz

[apple/icu.git] / icuSources / common / unormimp.h
diff --git a/icuSources/common/unormimp.h b/icuSources/common/unormimp.h

index 284bca59e196d3a966c74668198a86ccfb2bc073..88c7975cc4e783593357f1dac1058273f01df002 100644 (file)
--- a/icuSources/common/unormimp.h
+++ b/icuSources/common/unormimp.h
@@ -1,12 +1,14 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2001-2003, International Business Machines
+*   Copyright (C) 2001-2011, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
  *   file name:  unormimp.h
-*   encoding:   US-ASCII
+*   encoding:   UTF-8
  *   tab size:   8 (not used)
  *   indentation:4
  *
@@ -21,15 +23,11 @@
  
  #if !UCONFIG_NO_NORMALIZATION
  
-#include "unicode/uiter.h"
-#include "unicode/unorm.h"
-#include "unicode/uset.h"
-#include "utrie.h"
-#include "ustr_imp.h"
+#include "udataswp.h"
  
  /*
- * This new implementation of the normalization code loads its data from
- * unorm.dat, which is generated with the gennorm tool.
+ * The 2001-2010 implementation of the normalization code loads its data from
+ * unorm.icu, which is generated with the gennorm tool.
   * The format of that file is described at the end of this file.
   */
  
@@ -91,11 +89,19 @@ enum {
  
  /* canonStartSets[0..31] contains indexes for what is in the array */
  enum {
-    _NORM_SET_INDEX_CANON_SETS_LENGTH,  /* number of uint16_t in canonical starter sets */
+    _NORM_SET_INDEX_CANON_SETS_LENGTH,      /* number of uint16_t in canonical starter sets */
      _NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */
      _NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */
  
-    _NORM_SET_INDEX_TOP=32              /* changing this requires a new formatVersion */
+    /* from formatVersion 2.3: */
+    _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,   /* uint16_t offset from canonStartSets[0] to the
+                                               exclusion set for CJK compatibility characters */
+    _NORM_SET_INDEX_NX_UNICODE32_OFFSET,    /* uint16_t offset from canonStartSets[0] to the
+                                               exclusion set for Unicode 3.2 characters */
+    _NORM_SET_INDEX_NX_RESERVED_OFFSET,     /* uint16_t offset from canonStartSets[0] to the
+                                               end of the previous exclusion set */
+
+    _NORM_SET_INDEX_TOP=32                  /* changing this requires a new formatVersion */
  };
  
  /* more constants for canonical starter sets */
@@ -150,235 +156,23 @@ enum {
      _NORM_DECOMP_LENGTH_MASK=0x7f
  };
  
-#endif /* #if !UCONFIG_NO_NORMALIZATION */
-
-/* Korean Hangul and Jamo constants */
-enum {
-    JAMO_L_BASE=0x1100,     /* "lead" jamo */
-    JAMO_V_BASE=0x1161,     /* "vowel" jamo */
-    JAMO_T_BASE=0x11a7,     /* "trail" jamo */
-
-    HANGUL_BASE=0xac00,
-
-    JAMO_L_COUNT=19,
-    JAMO_V_COUNT=21,
-    JAMO_T_COUNT=28,
-
-    HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT
-};
-
-#if !UCONFIG_NO_NORMALIZATION
-
-/* Constants for options flags for normalization. @draft ICU 2.6 */
+/** Constants for options flags for normalization. */
  enum {
-    /** Options bit 0, do not decompose Hangul syllables. @draft ICU 2.6 */
+    /** Options bit 0, do not decompose Hangul syllables. */
      UNORM_NX_HANGUL=1,
-    /** Options bit 1, do not decompose CJK compatibility characters. @draft ICU 2.6 */
+    /** Options bit 1, do not decompose CJK compatibility characters. */
      UNORM_NX_CJK_COMPAT=2
  };
  
  /**
- * Is the normalizer data loaded?
- * This is used internally before other internal normalizer functions
- * are called.
- * It saves this check in each of many normalization calls that
- * are made for, e.g., collation.
- *
- * @param pErrorCode as usual
- * @return boolean value for whether the normalization data is loaded
- *
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_haveData(UErrorCode *pErrorCode);
-
-/**
- * Internal API for normalizing.
- * Does not check for bad input.
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_internalNormalize(UChar *dest, int32_t destCapacity,
-                        const UChar *src, int32_t srcLength,
-                        UNormalizationMode mode, int32_t options,
-                        UErrorCode *pErrorCode);
-
-/**
- * internal API, used by normlzr.cpp
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_decompose(UChar *dest, int32_t destCapacity,
-                const UChar *src, int32_t srcLength,
-                UBool compat, int32_t options,
-                UErrorCode *pErrorCode);
-
-/**
- * internal API, used by normlzr.cpp
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_compose(UChar *dest, int32_t destCapacity,
-              const UChar *src, int32_t srcLength,
-              UBool compat, int32_t options,
-              UErrorCode *pErrorCode);
-
-#endif /* #if !UCONFIG_NO_NORMALIZATION */
-
-/**
- * Internal option for unorm_cmpEquivFold() for decomposing.
- * If not set, just do strcasecmp().
- * @internal
- */
-#define _COMPARE_EQUIV 0x80000
-
-#ifndef U_COMPARE_IGNORE_CASE
-/* see also unorm.h */
-/**
- * Option bit for unorm_compare:
- * Perform case-insensitive comparison.
- * @draft ICU 2.2
- */
-#define U_COMPARE_IGNORE_CASE       0x10000
-#endif
-
-/**
- * Internal option for unorm_cmpEquivFold() for strncmp style.
- * If set, checks for both string length and terminating NUL.
- * @internal
- */
-#define _STRNCMP_STYLE 0x1000
-
-/**
- * Internal API, used by u_strcasecmp() etc.
- * Compare strings for canonical equivalence (optional),
- * case-insensitively (optional),
- * in code point order or code unit order.
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_cmpEquivFold(const UChar *s1, int32_t length1,
-                   const UChar *s2, int32_t length2,
-                   uint32_t options,
-                   UErrorCode *pErrorCode);
-
-#if !UCONFIG_NO_NORMALIZATION
-
-/**
- * Internal API, used by collation code.
- * Get access to the internal FCD trie table to be able to perform
- * incremental, per-code unit, FCD checks in collation.
- * One pointer is sufficient because the trie index values are offset
- * by the index size, so that the same pointer is used to access the trie data.
- * @internal
- */
-U_CAPI const uint16_t * U_EXPORT2
-unorm_getFCDTrie(UErrorCode *pErrorCode);
-
-#ifdef XP_CPLUSPLUS
-
-U_NAMESPACE_BEGIN
-/**
- * Internal API, used by collation code.
- * Get the FCD value for a code unit, with
- * bits 15..8   lead combining class
- * bits  7..0   trail combining class
- *
- * If c is a lead surrogate and the value is not 0,
- * then instead of combining classes the value
- * is used in unorm_getFCD16FromSurrogatePair() to get the real value
- * of the supplementary code point.
- *
- * @internal
- */
-inline uint16_t
-unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
-    return
-        fcdTrieIndex[
-            (fcdTrieIndex[
-                c>>UTRIE_SHIFT
-            ]<<UTRIE_INDEX_SHIFT)+
-            (c&UTRIE_MASK)
-        ];
-}
-
-/**
- * Internal API, used by collation code.
- * Get the FCD value for a supplementary code point, with
- * bits 15..8   lead combining class
- * bits  7..0   trail combining class
- *
- * @param fcd16  The FCD value for the lead surrogate, not 0.
- * @param c2     The trail surrogate code unit.
- *
- * @internal
- */
-inline uint16_t
-unorm_getFCD16FromSurrogatePair(const uint16_t *fcdTrieIndex, uint16_t fcd16, UChar c2) {
-    return
-        fcdTrieIndex[
-            (fcdTrieIndex[
-                (int32_t)fcd16+((c2&0x3ff)>>UTRIE_SHIFT)
-            ]<<UTRIE_INDEX_SHIFT)+
-            (c2&UTRIE_MASK)
-        ];
-}
-
-U_NAMESPACE_END
-
-#endif
-
-/**
- * internal API, used by the canonical iterator
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_getDecomposition(UChar32 c, UBool compat,
-                       UChar *dest, int32_t destCapacity);
-
-/**
- * internal API, used by uprops.cpp
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_internalIsFullCompositionExclusion(UChar32 c);
-
-/**
- * Internal API, used by enumeration of canonically equivalent strings
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_isCanonSafeStart(UChar32 c);
-
-/**
- * Internal API, used by enumeration of canonically equivalent strings
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
-
-/**
- * Is c an NF<mode>-skippable code point? See unormimp.h.
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_isNFSkippable(UChar32 c, UNormalizationMode mode);
-
-/**
- * Enumerate each normalization data trie and add the
- * start of each range of same properties to the set.
- * @internal
- */
-U_CAPI void U_EXPORT2
-unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode);
-
-/**
- * Description of the format of unorm.dat version 2.2.
+ * Description of the format of unorm.icu version 2.3.
   *
   * Main change from version 1 to version 2:
   * Use of new, common UTrie instead of normalization-specific tries.
   * Change to version 2.1: add third/auxiliary trie with associated data.
   * Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK).
+ * Change to version 2.3: add serialized sets for normalization exclusions
+ *                        stored inside canonStartSets[]
   *
   * For more details of how to use the data structures see the code
   * in unorm.cpp (runtime normalization code) and
@@ -391,7 +185,7 @@ unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode);
   * unorm.dat customarily begins with a UDataInfo structure, see udata.h and .c.
   * After that there are the following structures:
   *
- * uint16_t indexes[_NORM_INDEX_TOP];           -- _NORM_INDEX_TOP=32, see enum in this file
+ * int32_t indexes[_NORM_INDEX_TOP];            -- _NORM_INDEX_TOP=32, see enum in this file
   *
   * UTrie normTrie;                              -- size in bytes=indexes[_NORM_INDEX_TRIE_SIZE]
   * 
@@ -662,6 +456,31 @@ unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode);
   *     if the high word has bit 15 set, then build a set with a single code point
   *     which is (((high16(cp)&0x1f00)<<8)|result;
   *     else there is a USerializedSet at canonStartSets+result
+ *
+ * FormatVersion 2.3 adds 2 serialized sets for normalization exclusions.
+ * They are stored in the data file so that the runtime normalization code need
+ * not depend on other properties and their data and implementation files.
+ * The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table
+ * give the location for each set.
+ * There is no set stored for UNORM_NX_HANGUL because it's trivial to create
+ * without using properties.
+ *
+ * Set contents:
+ *
+ * _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT)
+ *     [[:Ideographic:]&[:NFD_QC=No:]]
+ *     =[CJK Ideographs]&[has canonical decomposition]
+ *
+ * _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2)
+ *     [:^Age=3.2:]
+ *     =set with all code points that were not designated by the specified Unicode version
+ *
+ * _NORM_SET_INDEX_NX_RESERVED_OFFSET
+ *     This is an offset that points to where the next, future set would start.
+ *     Currently it indicates where the previous set ends, and thus its length.
+ *     The name for this enum constant may in the future be applied to different
+ *     index slots. In order to get the limit of a set, use its index slot and
+ *     the immediately following one regardless of that one's enum name.
   */
  
  #endif /* #if !UCONFIG_NO_NORMALIZATION */