+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
-* Copyright (C) 2001-2003, International Business Machines
+* Copyright (C) 2001-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: unormimp.h
-* encoding: US-ASCII
+* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
#if !UCONFIG_NO_NORMALIZATION
-#include "unicode/uiter.h"
-#include "unicode/unorm.h"
-#include "unicode/uset.h"
-#include "utrie.h"
-#include "ustr_imp.h"
+#include "udataswp.h"
/*
- * This new implementation of the normalization code loads its data from
- * unorm.dat, which is generated with the gennorm tool.
+ * The 2001-2010 implementation of the normalization code loads its data from
+ * unorm.icu, which is generated with the gennorm tool.
* The format of that file is described at the end of this file.
*/
/* canonStartSets[0..31] contains indexes for what is in the array */
enum {
- _NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */
+ _NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */
_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */
_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */
- _NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */
+ /* from formatVersion 2.3: */
+ _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET, /* uint16_t offset from canonStartSets[0] to the
+ exclusion set for CJK compatibility characters */
+ _NORM_SET_INDEX_NX_UNICODE32_OFFSET, /* uint16_t offset from canonStartSets[0] to the
+ exclusion set for Unicode 3.2 characters */
+ _NORM_SET_INDEX_NX_RESERVED_OFFSET, /* uint16_t offset from canonStartSets[0] to the
+ end of the previous exclusion set */
+
+ _NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */
};
/* more constants for canonical starter sets */
_NORM_DECOMP_LENGTH_MASK=0x7f
};
-#endif /* #if !UCONFIG_NO_NORMALIZATION */
-
-/* Korean Hangul and Jamo constants */
-enum {
- JAMO_L_BASE=0x1100, /* "lead" jamo */
- JAMO_V_BASE=0x1161, /* "vowel" jamo */
- JAMO_T_BASE=0x11a7, /* "trail" jamo */
-
- HANGUL_BASE=0xac00,
-
- JAMO_L_COUNT=19,
- JAMO_V_COUNT=21,
- JAMO_T_COUNT=28,
-
- HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT
-};
-
-#if !UCONFIG_NO_NORMALIZATION
-
-/* Constants for options flags for normalization. @draft ICU 2.6 */
+/** Constants for options flags for normalization. */
enum {
- /** Options bit 0, do not decompose Hangul syllables. @draft ICU 2.6 */
+ /** Options bit 0, do not decompose Hangul syllables. */
UNORM_NX_HANGUL=1,
- /** Options bit 1, do not decompose CJK compatibility characters. @draft ICU 2.6 */
+ /** Options bit 1, do not decompose CJK compatibility characters. */
UNORM_NX_CJK_COMPAT=2
};
/**
- * Is the normalizer data loaded?
- * This is used internally before other internal normalizer functions
- * are called.
- * It saves this check in each of many normalization calls that
- * are made for, e.g., collation.
- *
- * @param pErrorCode as usual
- * @return boolean value for whether the normalization data is loaded
- *
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_haveData(UErrorCode *pErrorCode);
-
-/**
- * Internal API for normalizing.
- * Does not check for bad input.
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_internalNormalize(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- UNormalizationMode mode, int32_t options,
- UErrorCode *pErrorCode);
-
-/**
- * internal API, used by normlzr.cpp
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_decompose(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- UBool compat, int32_t options,
- UErrorCode *pErrorCode);
-
-/**
- * internal API, used by normlzr.cpp
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_compose(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- UBool compat, int32_t options,
- UErrorCode *pErrorCode);
-
-#endif /* #if !UCONFIG_NO_NORMALIZATION */
-
-/**
- * Internal option for unorm_cmpEquivFold() for decomposing.
- * If not set, just do strcasecmp().
- * @internal
- */
-#define _COMPARE_EQUIV 0x80000
-
-#ifndef U_COMPARE_IGNORE_CASE
-/* see also unorm.h */
-/**
- * Option bit for unorm_compare:
- * Perform case-insensitive comparison.
- * @draft ICU 2.2
- */
-#define U_COMPARE_IGNORE_CASE 0x10000
-#endif
-
-/**
- * Internal option for unorm_cmpEquivFold() for strncmp style.
- * If set, checks for both string length and terminating NUL.
- * @internal
- */
-#define _STRNCMP_STYLE 0x1000
-
-/**
- * Internal API, used by u_strcasecmp() etc.
- * Compare strings for canonical equivalence (optional),
- * case-insensitively (optional),
- * in code point order or code unit order.
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_cmpEquivFold(const UChar *s1, int32_t length1,
- const UChar *s2, int32_t length2,
- uint32_t options,
- UErrorCode *pErrorCode);
-
-#if !UCONFIG_NO_NORMALIZATION
-
-/**
- * Internal API, used by collation code.
- * Get access to the internal FCD trie table to be able to perform
- * incremental, per-code unit, FCD checks in collation.
- * One pointer is sufficient because the trie index values are offset
- * by the index size, so that the same pointer is used to access the trie data.
- * @internal
- */
-U_CAPI const uint16_t * U_EXPORT2
-unorm_getFCDTrie(UErrorCode *pErrorCode);
-
-#ifdef XP_CPLUSPLUS
-
-U_NAMESPACE_BEGIN
-/**
- * Internal API, used by collation code.
- * Get the FCD value for a code unit, with
- * bits 15..8 lead combining class
- * bits 7..0 trail combining class
- *
- * If c is a lead surrogate and the value is not 0,
- * then instead of combining classes the value
- * is used in unorm_getFCD16FromSurrogatePair() to get the real value
- * of the supplementary code point.
- *
- * @internal
- */
-inline uint16_t
-unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
- return
- fcdTrieIndex[
- (fcdTrieIndex[
- c>>UTRIE_SHIFT
- ]<<UTRIE_INDEX_SHIFT)+
- (c&UTRIE_MASK)
- ];
-}
-
-/**
- * Internal API, used by collation code.
- * Get the FCD value for a supplementary code point, with
- * bits 15..8 lead combining class
- * bits 7..0 trail combining class
- *
- * @param fcd16 The FCD value for the lead surrogate, not 0.
- * @param c2 The trail surrogate code unit.
- *
- * @internal
- */
-inline uint16_t
-unorm_getFCD16FromSurrogatePair(const uint16_t *fcdTrieIndex, uint16_t fcd16, UChar c2) {
- return
- fcdTrieIndex[
- (fcdTrieIndex[
- (int32_t)fcd16+((c2&0x3ff)>>UTRIE_SHIFT)
- ]<<UTRIE_INDEX_SHIFT)+
- (c2&UTRIE_MASK)
- ];
-}
-
-U_NAMESPACE_END
-
-#endif
-
-/**
- * internal API, used by the canonical iterator
- * @internal
- */
-U_CAPI int32_t U_EXPORT2
-unorm_getDecomposition(UChar32 c, UBool compat,
- UChar *dest, int32_t destCapacity);
-
-/**
- * internal API, used by uprops.cpp
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_internalIsFullCompositionExclusion(UChar32 c);
-
-/**
- * Internal API, used by enumeration of canonically equivalent strings
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_isCanonSafeStart(UChar32 c);
-
-/**
- * Internal API, used by enumeration of canonically equivalent strings
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
-
-/**
- * Is c an NF<mode>-skippable code point? See unormimp.h.
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-unorm_isNFSkippable(UChar32 c, UNormalizationMode mode);
-
-/**
- * Enumerate each normalization data trie and add the
- * start of each range of same properties to the set.
- * @internal
- */
-U_CAPI void U_EXPORT2
-unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode);
-
-/**
- * Description of the format of unorm.dat version 2.2.
+ * Description of the format of unorm.icu version 2.3.
*
* Main change from version 1 to version 2:
* Use of new, common UTrie instead of normalization-specific tries.
* Change to version 2.1: add third/auxiliary trie with associated data.
* Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK).
+ * Change to version 2.3: add serialized sets for normalization exclusions
+ * stored inside canonStartSets[]
*
* For more details of how to use the data structures see the code
* in unorm.cpp (runtime normalization code) and
* unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
* After that there are the following structures:
*
- * uint16_t indexes[_NORM_INDEX_TOP]; -- _NORM_INDEX_TOP=32, see enum in this file
+ * int32_t indexes[_NORM_INDEX_TOP]; -- _NORM_INDEX_TOP=32, see enum in this file
*
* UTrie normTrie; -- size in bytes=indexes[_NORM_INDEX_TRIE_SIZE]
*
* if the high word has bit 15 set, then build a set with a single code point
* which is ((high16(cp)&0x1f00)<<8)|result;
* else there is a USerializedSet at canonStartSets+result
+ *
+ * FormatVersion 2.3 adds 2 serialized sets for normalization exclusions.
+ * They are stored in the data file so that the runtime normalization code need
+ * not depend on other properties and their data and implementation files.
+ * The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table
+ * give the location for each set.
+ * There is no set stored for UNORM_NX_HANGUL because it's trivial to create
+ * without using properties.
+ *
+ * Set contents:
+ *
+ * _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT)
+ * [[:Ideographic:]&[:NFD_QC=No:]]
+ * =[CJK Ideographs]&[has canonical decomposition]
+ *
+ * _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2)
+ * [:^Age=3.2:]
+ * =set with all code points that were not designated by the specified Unicode version
+ *
+ * _NORM_SET_INDEX_NX_RESERVED_OFFSET
+ * This is an offset that points to where the next, future set would start.
+ * Currently it indicates where the previous set ends, and thus its length.
+ * The name for this enum constant may in the future be applied to different
+ * index slots. In order to get the limit of a set, use its index slot and
+ * the immediately following one regardless of that one's enum name.
*/
#endif /* #if !UCONFIG_NO_NORMALIZATION */