X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/common/ucase.h?ds=sidebyside diff --git a/icuSources/common/ucase.h b/icuSources/common/ucase.h index e3e5be05..b0a453b8 100644 --- a/icuSources/common/ucase.h +++ b/icuSources/common/ucase.h @@ -1,12 +1,14 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * -* Copyright (C) 2004-2006, International Business Machines +* Copyright (C) 2004-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: ucase.h -* encoding: US-ASCII +* encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * @@ -21,45 +23,23 @@ #include "unicode/utypes.h" #include "unicode/uset.h" +#include "putilimp.h" #include "uset_imp.h" #include "udataswp.h" +#include "utrie2.h" -U_CDECL_BEGIN - -/* library API -------------------------------------------------------------- */ - -struct UCaseProps; -typedef struct UCaseProps UCaseProps; - -U_CAPI UCaseProps * U_EXPORT2 -ucase_open(UErrorCode *pErrorCode); - -U_CAPI UCaseProps * U_EXPORT2 -ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode); - -U_CAPI void U_EXPORT2 -ucase_close(UCaseProps *csp); - +#ifdef __cplusplus +U_NAMESPACE_BEGIN -U_CAPI const UCaseProps * U_EXPORT2 -ucase_getSingleton(UErrorCode *pErrorCode); - -/** - * Get a singleton dummy object, one that works with no real data. - * This can be used when the real data is not available. - * Using the dummy can reduce checks for available data after an initial failure. - */ -U_CAPI const UCaseProps * U_EXPORT2 -ucase_getDummy(UErrorCode *pErrorCode); +class UnicodeString; +U_NAMESPACE_END +#endif -U_CAPI int32_t U_EXPORT2 -ucase_swap(const UDataSwapper *ds, - const void *inData, int32_t length, void *outData, - UErrorCode *pErrorCode); +/* library API -------------------------------------------------------------- */ -U_CAPI void U_EXPORT2 -ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode); +U_CFUNC void U_EXPORT2 +ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); /** * Requires non-NULL locale ID but otherwise does the equivalent of @@ -67,12 +47,22 @@ ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode * * Accepts both 2- and 3-letter codes and accepts case variants. */ U_CFUNC int32_t -ucase_getCaseLocale(const char *locale, int32_t *locCache); +ucase_getCaseLocale(const char *locale); + +/* Casing locale types for ucase_getCaseLocale */ +enum { + UCASE_LOC_UNKNOWN, + UCASE_LOC_ROOT, + UCASE_LOC_TURKISH, + UCASE_LOC_LITHUANIAN, + UCASE_LOC_GREEK, + UCASE_LOC_DUTCH +}; /** * Bit mask for getting just the options from a string compare options word * that are relevant for case-insensitive string comparison. - * See uchar.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER. + * See stringoptions.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER. * @internal */ #define _STRCASECMP_OPTIONS_MASK 0xffff @@ -80,24 +70,30 @@ ucase_getCaseLocale(const char *locale, int32_t *locCache); /** * Bit mask for getting just the options from a string compare options word * that are relevant for case folding (of a single string or code point). - * See uchar.h. + * + * Currently only bit 0 for U_FOLD_CASE_EXCLUDE_SPECIAL_I. + * It is conceivable that at some point we might use one more bit for using uppercase sharp s. + * It is conceivable that at some point we might want the option to use only simple case foldings + * when operating on strings. + * + * See stringoptions.h. * @internal */ -#define _FOLD_CASE_OPTIONS_MASK 0xff +#define _FOLD_CASE_OPTIONS_MASK 7 /* single-code point functions */ U_CAPI UChar32 U_EXPORT2 -ucase_tolower(const UCaseProps *csp, UChar32 c); +ucase_tolower(UChar32 c); U_CAPI UChar32 U_EXPORT2 -ucase_toupper(const UCaseProps *csp, UChar32 c); +ucase_toupper(UChar32 c); U_CAPI UChar32 U_EXPORT2 -ucase_totitle(const UCaseProps *csp, UChar32 c); +ucase_totitle(UChar32 c); U_CAPI UChar32 U_EXPORT2 -ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options); +ucase_fold(UChar32 c, uint32_t options); /** * Adds all simple case mappings and the full case folding for c to sa, @@ -108,8 +104,8 @@ ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options); * - for sharp s include ss * - for k include the Kelvin sign */ -U_CAPI void U_EXPORT2 -ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa); +U_CFUNC void U_EXPORT2 +ucase_addCaseClosure(UChar32 c, const USetAdder *sa); /** * Maps the string to single code points and adds the associated case closure @@ -123,25 +119,84 @@ ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa); * * @return TRUE if the string was found */ -U_CAPI UBool U_EXPORT2 -ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa); +U_CFUNC UBool U_EXPORT2 +ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa); + +#ifdef __cplusplus +U_NAMESPACE_BEGIN + +/** + * Iterator over characters with more than one code point in the full default Case_Folding. + */ +class U_COMMON_API FullCaseFoldingIterator { +public: + /** Constructor. */ + FullCaseFoldingIterator(); + /** + * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding. + * Returns a negative cp value at the end of the iteration. + */ + UChar32 next(UnicodeString &full); +private: + FullCaseFoldingIterator(const FullCaseFoldingIterator &); // no copy + FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &); // no assignment + + const UChar *unfold; + int32_t unfoldRows; + int32_t unfoldRowWidth; + int32_t unfoldStringWidth; + int32_t currentRow; + int32_t rowCpIndex; +}; + +/** + * Fast case mapping data for ASCII/Latin. + * Linear arrays of delta bytes: 0=no mapping; EXC=exception. + * Deltas must not cross the ASCII boundary, or else they cannot be easily used + * in simple UTF-8 code. + */ +namespace LatinCase { + +/** Case mapping/folding data for code points up to U+017F. */ +constexpr UChar LIMIT = 0x180; +/** U+017F case-folds and uppercases crossing the ASCII boundary. */ +constexpr UChar LONG_S = 0x17f; +/** Exception: Complex mapping, or too-large delta. */ +constexpr int8_t EXC = -0x80; + +/** Deltas for lowercasing for most locales, and default case folding. */ +extern const int8_t TO_LOWER_NORMAL[LIMIT]; +/** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */ +extern const int8_t TO_LOWER_TR_LT[LIMIT]; + +/** Deltas for uppercasing for most locales. */ +extern const int8_t TO_UPPER_NORMAL[LIMIT]; +/** Deltas for uppercasing for tr/az. */ +extern const int8_t TO_UPPER_TR[LIMIT]; + +} // namespace LatinCase + +U_NAMESPACE_END +#endif /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ U_CAPI int32_t U_EXPORT2 -ucase_getType(const UCaseProps *csp, UChar32 c); +ucase_getType(UChar32 c); -/** @return same as ucase_getType(), or <0 if c is case-ignorable */ +/** @return like ucase_getType() but also sets UCASE_IGNORABLE if c is case-ignorable */ U_CAPI int32_t U_EXPORT2 -ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c); +ucase_getTypeOrIgnorable(UChar32 c); U_CAPI UBool U_EXPORT2 -ucase_isSoftDotted(const UCaseProps *csp, UChar32 c); +ucase_isSoftDotted(UChar32 c); U_CAPI UBool U_EXPORT2 -ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c); +ucase_isCaseSensitive(UChar32 c); /* string case mapping functions */ +U_CDECL_BEGIN + /** * Iterator function for string case mappings, which need to look at the * context (surrounding text) of a given character for conditional mappings. @@ -176,6 +231,10 @@ struct UCaseContext { }; typedef struct UCaseContext UCaseContext; +U_CDECL_END + +#define UCASECONTEXT_INITIALIZER { NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0 } + enum { /** * For string case mappings, a single character (a code point) is mapped @@ -205,10 +264,7 @@ enum { * @param context Pointer to be passed into iter. * @param pString If the mapping result is a string, then the pointer is * written to *pString. - * @param locale Locale ID for locale-dependent mappings. - * @param locCache Initialize to 0; may be used to cache the result of parsing - * the locale ID for subsequent calls. - * Can be NULL. + * @param caseLocale Case locale value from ucase_getCaseLocale(). * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH. * * @see UCaseContextIterator @@ -216,31 +272,45 @@ enum { * @internal */ U_CAPI int32_t U_EXPORT2 -ucase_toFullLower(const UCaseProps *csp, UChar32 c, +ucase_toFullLower(UChar32 c, UCaseContextIterator *iter, void *context, const UChar **pString, - const char *locale, int32_t *locCache); + int32_t caseLocale); U_CAPI int32_t U_EXPORT2 -ucase_toFullUpper(const UCaseProps *csp, UChar32 c, +ucase_toFullUpper(UChar32 c, UCaseContextIterator *iter, void *context, const UChar **pString, - const char *locale, int32_t *locCache); + int32_t caseLocale); U_CAPI int32_t U_EXPORT2 -ucase_toFullTitle(const UCaseProps *csp, UChar32 c, +ucase_toFullTitle(UChar32 c, UCaseContextIterator *iter, void *context, const UChar **pString, - const char *locale, int32_t *locCache); + int32_t caseLocale); U_CAPI int32_t U_EXPORT2 -ucase_toFullFolding(const UCaseProps *csp, UChar32 c, +ucase_toFullFolding(UChar32 c, const UChar **pString, uint32_t options); U_CFUNC int32_t U_EXPORT2 ucase_hasBinaryProperty(UChar32 c, UProperty which); + +U_CDECL_BEGIN + +/** + * @internal + */ +typedef int32_t U_CALLCONV +UCaseMapFull(UChar32 c, + UCaseContextIterator *iter, void *context, + const UChar **pString, + int32_t caseLocale); + +U_CDECL_END + /* file definitions --------------------------------------------------------- */ #define UCASE_DATA_NAME "ucase" @@ -266,6 +336,9 @@ enum { /* definitions for 16-bit case properties word ------------------------------ */ +U_CFUNC const UTrie2 * U_EXPORT2 +ucase_getTrie(); + /* 2-bit constants for types of cased characters */ #define UCASE_TYPE_MASK 3 enum { @@ -276,33 +349,40 @@ enum { }; #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK) +#define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7) + +#define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2) -#define UCASE_SENSITIVE 4 -#define UCASE_EXCEPTION 8 +#define UCASE_IGNORABLE 4 +#define UCASE_EXCEPTION 8 +#define UCASE_SENSITIVE 0x10 -#define UCASE_DOT_MASK 0x30 +#define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) + +#define UCASE_DOT_MASK 0x60 enum { UCASE_NO_DOT=0, /* normal characters with cc=0 */ - UCASE_SOFT_DOTTED=0x10, /* soft-dotted characters with cc=0 */ - UCASE_ABOVE=0x20, /* "above" accents with cc=230 */ - UCASE_OTHER_ACCENT=0x30 /* other accent character (0>UCASE_DELTA_SHIFT) - -/* case-ignorable uses one of the delta bits, see gencase/store.c */ -#define UCASE_CASE_IGNORABLE 0x40 +#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC +# define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT) +#else +# define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT)) +#endif /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ #define UCASE_EXC_SHIFT 4 #define UCASE_EXC_MASK 0xfff0 -#define UCASE_MAX_EXCEPTIONS 0x1000 +#define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) /* definitions for 16-bit main exceptions word ------------------------------ */ @@ -312,7 +392,7 @@ enum { UCASE_EXC_FOLD, UCASE_EXC_UPPER, UCASE_EXC_TITLE, - UCASE_EXC_4, /* reserved */ + UCASE_EXC_DELTA, UCASE_EXC_5, /* reserved */ UCASE_EXC_CLOSURE, UCASE_EXC_FULL_MAPPINGS, @@ -322,10 +402,14 @@ enum { /* each slot is 2 uint16_t instead of 1 */ #define UCASE_EXC_DOUBLE_SLOTS 0x100 -/* reserved: exception bits 11..9 */ +enum { + UCASE_EXC_NO_SIMPLE_CASE_FOLDING=0x200, + UCASE_EXC_DELTA_IS_NEGATIVE=0x400, + UCASE_EXC_SENSITIVE=0x800 +}; /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<