2 *******************************************************************************
4 * Copyright (C) 2002-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2002feb24
14 * created by: Markus W. Scherer
16 * Implementations for mostly non-core Unicode character properties
17 * stored in uprops.icu.
19 * With the APIs implemented here, almost all properties files and
20 * their associated implementation files are used from this file,
21 * including those for normalization and case mappings.
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uscript.h"
/* Element count of a true array (not a pointer): total size divided by the size of element 0, converted to int32_t. */
31 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
33 /* API functions ------------------------------------------------------------ */
38 } binProps
[UCHAR_BINARY_LIMIT
]={
/*
 * Column and mask values for binary properties from u_getUnicodeProperties().
 * The entries must be in the order of the corresponding UProperty values,
 * and there must be exactly one entry per binary UProperty.
 *
 * Properties with mask 0 are handled in code.
 * For them, column is the UPropertySource value.
 */
/*
 * Table rows for binProps[]. Each row is { column, mask }:
 * - Rows whose mask is built with U_MASK(...) are looked up by
 *   u_hasBinaryProperty() as (u_getUnicodeProperties(c, column) & mask) != 0.
 *   The columns used by such rows below are 1, 2 and -1.
 * - Rows with mask 0 carry a UPropertySource (UPROPS_SRC_CASE or
 *   UPROPS_SRC_NORM) in the column slot; the trailing comment on each of
 *   those rows names the UProperty it stands for.
 * Row order matters: the table is indexed directly by the UProperty value.
 */
47 { 1, U_MASK(UPROPS_ALPHABETIC
) },
48 { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT
) },
49 { 1, U_MASK(UPROPS_BIDI_CONTROL
) },
/* NOTE(review): the only stored row with column -1; what column -1 selects is decided inside u_getUnicodeProperties() - confirm there. */
50 { -1, U_MASK(UPROPS_MIRROR_SHIFT
) },
51 { 1, U_MASK(UPROPS_DASH
) },
52 { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT
) },
53 { 1, U_MASK(UPROPS_DEPRECATED
) },
54 { 1, U_MASK(UPROPS_DIACRITIC
) },
55 { 1, U_MASK(UPROPS_EXTENDER
) },
56 { UPROPS_SRC_NORM
, 0 }, /* UCHAR_FULL_COMPOSITION_EXCLUSION */
57 { 1, U_MASK(UPROPS_GRAPHEME_BASE
) },
58 { 1, U_MASK(UPROPS_GRAPHEME_EXTEND
) },
59 { 1, U_MASK(UPROPS_GRAPHEME_LINK
) },
60 { 1, U_MASK(UPROPS_HEX_DIGIT
) },
61 { 1, U_MASK(UPROPS_HYPHEN
) },
62 { 1, U_MASK(UPROPS_ID_CONTINUE
) },
63 { 1, U_MASK(UPROPS_ID_START
) },
64 { 1, U_MASK(UPROPS_IDEOGRAPHIC
) },
65 { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR
) },
66 { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR
) },
67 { 1, U_MASK(UPROPS_JOIN_CONTROL
) },
68 { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION
) },
69 { UPROPS_SRC_CASE
, 0 }, /* UCHAR_LOWERCASE */
70 { 1, U_MASK(UPROPS_MATH
) },
71 { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT
) },
72 { 1, U_MASK(UPROPS_QUOTATION_MARK
) },
73 { 1, U_MASK(UPROPS_RADICAL
) },
74 { UPROPS_SRC_CASE
, 0 }, /* UCHAR_SOFT_DOTTED */
75 { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION
) },
76 { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH
) },
77 { UPROPS_SRC_CASE
, 0 }, /* UCHAR_UPPERCASE */
78 { 1, U_MASK(UPROPS_WHITE_SPACE
) },
79 { 1, U_MASK(UPROPS_XID_CONTINUE
) },
80 { 1, U_MASK(UPROPS_XID_START
) },
81 { UPROPS_SRC_CASE
, 0 }, /* UCHAR_CASE_SENSITIVE */
/* The next two rows are the only ones that read column 2. */
82 { 2, U_MASK(UPROPS_V2_S_TERM
) },
83 { 2, U_MASK(UPROPS_V2_VARIATION_SELECTOR
) },
84 { UPROPS_SRC_NORM
, 0 }, /* UCHAR_NFD_INERT */
85 { UPROPS_SRC_NORM
, 0 }, /* UCHAR_NFKD_INERT */
86 { UPROPS_SRC_NORM
, 0 }, /* UCHAR_NFC_INERT */
87 { UPROPS_SRC_NORM
, 0 }, /* UCHAR_NFKC_INERT */
88 { UPROPS_SRC_NORM
, 0 } /* UCHAR_SEGMENT_STARTER */
/*
 * u_hasBinaryProperty(c, which): test binary property `which` for code point c.
 *
 * What the visible lines do:
 * - `which` is range-checked against UCHAR_BINARY_START / UCHAR_BINARY_LIMIT
 *   before it is used as an index into binProps[].
 * - The row's mask and column are read; the stored-property path returns
 *   (u_getUnicodeProperties(c, column) & mask) != 0.
 * - column == UPROPS_SRC_CASE: the answer comes from ucase_getType(),
 *   ucase_isSoftDotted() or ucase_isCaseSensitive(), using the UCaseProps
 *   pointer obtained from ucase_getSingleton().
 * - column == UPROPS_SRC_NORM: the answer comes from unorm_* functions; that
 *   code is only compiled when UCONFIG_NO_NORMALIZATION is not set.
 *
 * NOTE(review): this listing has lost lines (the embedded numbering skips
 * 96, 99, 102, 108-110, 119-121, 125 and 135-144). The statements taken for an
 * out-of-range `which` or a failed ucase_getSingleton(), the test that selects
 * the stored-property path, the switch headers and the closing braces are not
 * visible here - confirm them against the complete source.
 */
91 U_CAPI UBool U_EXPORT2
92 u_hasBinaryProperty(UChar32 c
, UProperty which
) {
93 /* c is range-checked in the functions that are called from here */
/* Guard: `which` must lie in [UCHAR_BINARY_START, UCHAR_BINARY_LIMIT) before it indexes binProps[]. */
94 if(which
<UCHAR_BINARY_START
|| UCHAR_BINARY_LIMIT
<=which
) {
95 /* not a known binary property */
/* Read the table row for this property. */
97 uint32_t mask
=binProps
[which
].mask
;
98 int32_t column
=binProps
[which
].column
;
100 /* systematic, directly stored properties */
101 return (u_getUnicodeProperties(c
, column
)&mask
)!=0;
/* From here on the column value is compared against UPropertySource constants, not used as a data column. */
103 if(column
==UPROPS_SRC_CASE
) {
104 /* case mapping properties */
105 UErrorCode errorCode
=U_ZERO_ERROR
;
106 UCaseProps
*csp
=ucase_getSingleton(&errorCode
);
107 if(U_FAILURE(errorCode
)) {
/* Lowercase/uppercase are decided by comparing ucase_getType() with UCASE_LOWER/UCASE_UPPER; the comparison result is cast to UBool. */
111 case UCHAR_LOWERCASE
:
112 return (UBool
)(UCASE_LOWER
==ucase_getType(csp
, c
));
113 case UCHAR_UPPERCASE
:
114 return (UBool
)(UCASE_UPPER
==ucase_getType(csp
, c
));
115 case UCHAR_SOFT_DOTTED
:
116 return ucase_isSoftDotted(csp
, c
);
117 case UCHAR_CASE_SENSITIVE
:
118 return ucase_isCaseSensitive(csp
, c
);
122 } else if(column
==UPROPS_SRC_NORM
) {
123 #if !UCONFIG_NO_NORMALIZATION
124 /* normalization properties from unorm.icu */
126 case UCHAR_FULL_COMPOSITION_EXCLUSION
:
127 return unorm_internalIsFullCompositionExclusion(c
);
/*
 * The four *_INERT labels share one return: which-UCHAR_NFD_INERT is used as an
 * offset from UNORM_NFD. This assumes the four property constants are consecutive
 * and in the same order as the normalization modes starting at UNORM_NFD - confirm
 * against the enum declarations.
 */
128 case UCHAR_NFD_INERT
:
129 case UCHAR_NFKD_INERT
:
130 case UCHAR_NFC_INERT
:
131 case UCHAR_NFKC_INERT
:
132 return unorm_isNFSkippable(c
, (UNormalizationMode
)(which
-UCHAR_NFD_INERT
)+UNORM_NFD
);
133 case UCHAR_SEGMENT_STARTER
:
134 return unorm_isCanonSafeStart(c
);
/*
 * u_getIntPropertyValue(c, which): value of property `which` for code point c,
 * returned as int32_t.
 *
 * Dispatch visible in this listing, in order:
 *   which <  UCHAR_BINARY_START           -> 0 (marked undefined)
 *   which <  UCHAR_BINARY_LIMIT           -> u_hasBinaryProperty(c, which), converted to int32_t
 *   which <  UCHAR_INT_START              -> 0 (marked undefined)
 *   which <  UCHAR_INT_LIMIT              -> one return per property, see the case labels
 *   which == UCHAR_GENERAL_CATEGORY_MASK  -> U_MASK(u_charType(c))
 *   a final return of 0, marked undefined, follows.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbering
 * skips e.g. 156, 159, 164-166, 181, 196-197). The switch header, two case
 * labels, whatever follows the UCONFIG_NO_NORMALIZATION test inside the
 * combining-class case, and the closing braces are not visible - confirm them
 * against the complete source.
 */
145 U_CAPI
int32_t U_EXPORT2
146 u_getIntPropertyValue(UChar32 c
, UProperty which
) {
147 UErrorCode errorCode
;
149 if(which
<UCHAR_BINARY_START
) {
150 return 0; /* undefined */
151 } else if(which
<UCHAR_BINARY_LIMIT
) {
152 return (int32_t)u_hasBinaryProperty(c
, which
);
153 } else if(which
<UCHAR_INT_START
) {
154 return 0; /* undefined */
155 } else if(which
<UCHAR_INT_LIMIT
) {
157 case UCHAR_BIDI_CLASS
:
158 return (int32_t)u_charDirection(c
);
/* NOTE(review): no case label is visible directly above this return (numbering skips 159) - confirm which property it serves. */
160 return (int32_t)ublock_getCode(c
);
161 case UCHAR_CANONICAL_COMBINING_CLASS
:
162 #if !UCONFIG_NO_NORMALIZATION
163 return u_getCombiningClass(c
);
/* Decomposition type: the masked value from column 2 is returned without a shift, unlike the fields below. */
167 case UCHAR_DECOMPOSITION_TYPE
:
168 return (int32_t)(u_getUnicodeProperties(c
, 2)&UPROPS_DT_MASK
);
/*
 * Masked-and-shifted fields: the (int32_t) cast applies to the parenthesized
 * AND, and the right shift is applied to the cast result.
 */
169 case UCHAR_EAST_ASIAN_WIDTH
:
170 return (int32_t)(u_getUnicodeProperties(c
, 0)&UPROPS_EA_MASK
)>>UPROPS_EA_SHIFT
;
171 case UCHAR_GENERAL_CATEGORY
:
172 return (int32_t)u_charType(c
);
173 case UCHAR_JOINING_GROUP
:
174 return (int32_t)(u_getUnicodeProperties(c
, 2)&UPROPS_JG_MASK
)>>UPROPS_JG_SHIFT
;
175 case UCHAR_JOINING_TYPE
:
176 return (int32_t)(u_getUnicodeProperties(c
, 2)&UPROPS_JT_MASK
)>>UPROPS_JT_SHIFT
;
177 case UCHAR_LINE_BREAK
:
178 return (int32_t)(u_getUnicodeProperties(c
, 0)&UPROPS_LB_MASK
)>>UPROPS_LB_SHIFT
;
/* Numeric type: column -1 is requested and the word is passed through GET_NUMERIC_TYPE(). */
179 case UCHAR_NUMERIC_TYPE
:
180 return (int32_t)GET_NUMERIC_TYPE(u_getUnicodeProperties(c
, -1));
/*
 * errorCode is reset to U_ZERO_ERROR right before the call and is not inspected
 * afterwards; the script code is returned directly.
 * NOTE(review): no case label is visible above this statement (numbering skips 181).
 */
182 errorCode
=U_ZERO_ERROR
;
183 return (int32_t)uscript_getScript(c
, &errorCode
);
184 case UCHAR_HANGUL_SYLLABLE_TYPE
:
185 return uchar_getHST(c
);
186 #if !UCONFIG_NO_NORMALIZATION
/*
 * The four quick-check labels share one return: which-UCHAR_NFD_QUICK_CHECK is
 * used as an offset from UNORM_NFD. This assumes the four property constants
 * are consecutive and ordered like the normalization modes - confirm against
 * the enum declarations.
 */
187 case UCHAR_NFD_QUICK_CHECK
:
188 case UCHAR_NFKD_QUICK_CHECK
:
189 case UCHAR_NFC_QUICK_CHECK
:
190 case UCHAR_NFKC_QUICK_CHECK
:
191 return (int32_t)unorm_getQuickCheck(c
, (UNormalizationMode
)(which
-UCHAR_NFD_QUICK_CHECK
)+UNORM_NFD
);
/* Both values come from unorm_getFCD16FromCodePoint(): the lead value is the result shifted right by 8, the trail value its low 8 bits. */
192 case UCHAR_LEAD_CANONICAL_COMBINING_CLASS
:
193 return unorm_getFCD16FromCodePoint(c
)>>8;
194 case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS
:
195 return unorm_getFCD16FromCodePoint(c
)&0xff;
198 return 0; /* undefined */
200 } else if(which
==UCHAR_GENERAL_CATEGORY_MASK
) {
201 return U_MASK(u_charType(c
));
203 return 0; /* undefined */
207 U_CAPI
int32_t U_EXPORT2
208 u_getIntPropertyMinValue(UProperty which
) {
209 return 0; /* all binary/enum/int properties have a minimum value of 0 */
/*
 * u_getIntPropertyMaxValue(which): largest value that property `which` can take.
 *
 * Dispatch visible in this listing, in order:
 *   which <  UCHAR_BINARY_START  -> -1 (marked undefined)
 *   which <  UCHAR_BINARY_LIMIT  -> 1
 *   which <  UCHAR_INT_START     -> -1 (marked undefined)
 *   which <  UCHAR_INT_LIMIT     -> per property, see the case labels
 *   two further returns of -1, marked undefined, follow.
 *
 * Pattern used for the data-driven fields: the field is extracted from
 * uprv_getMaxValues(column) with the property's mask (and shift, where there
 * is one); when the extracted value is 0, the enum's *_COUNT-1 (or
 * USCRIPT_CODE_LIMIT-1) is returned instead.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbering
 * skips 214-215, 223, 226, 252, 264-265 and 267-268). The declaration of `max`,
 * the switch header, two case labels and the closing braces are not visible -
 * confirm them against the complete source.
 */
212 U_CAPI
int32_t U_EXPORT2
213 u_getIntPropertyMaxValue(UProperty which
) {
216 if(which
<UCHAR_BINARY_START
) {
217 return -1; /* undefined */
218 } else if(which
<UCHAR_BINARY_LIMIT
) {
219 return 1; /* maximum TRUE for all binary properties */
220 } else if(which
<UCHAR_INT_START
) {
221 return -1; /* undefined */
222 } else if(which
<UCHAR_INT_LIMIT
) {
224 case UCHAR_BIDI_CLASS
:
225 return (int32_t)U_CHAR_DIRECTION_COUNT
-1;
/* NOTE(review): no case label is visible directly above this assignment (numbering skips 226) - confirm which property it serves. */
227 max
=(uprv_getMaxValues(0)&UPROPS_BLOCK_MASK
)>>UPROPS_BLOCK_SHIFT
;
228 return max
!=0 ? max
: (int32_t)UBLOCK_COUNT
-1;
/* The three combining-class properties share the fixed maximum 0xff. */
229 case UCHAR_CANONICAL_COMBINING_CLASS
:
230 case UCHAR_LEAD_CANONICAL_COMBINING_CLASS
:
231 case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS
:
232 return 0xff; /* TODO do we need to be more precise, getting the actual maximum? */
/* Decomposition type: masked only, no shift. */
233 case UCHAR_DECOMPOSITION_TYPE
:
234 max
=uprv_getMaxValues(2)&UPROPS_DT_MASK
;
235 return max
!=0 ? max
: (int32_t)U_DT_COUNT
-1;
236 case UCHAR_EAST_ASIAN_WIDTH
:
237 max
=(uprv_getMaxValues(0)&UPROPS_EA_MASK
)>>UPROPS_EA_SHIFT
;
238 return max
!=0 ? max
: (int32_t)U_EA_COUNT
-1;
/* General category: fixed from the enum count, no data lookup. */
239 case UCHAR_GENERAL_CATEGORY
:
240 return (int32_t)U_CHAR_CATEGORY_COUNT
-1;
241 case UCHAR_JOINING_GROUP
:
242 max
=(uprv_getMaxValues(2)&UPROPS_JG_MASK
)>>UPROPS_JG_SHIFT
;
243 return max
!=0 ? max
: (int32_t)U_JG_COUNT
-1;
244 case UCHAR_JOINING_TYPE
:
245 max
=(uprv_getMaxValues(2)&UPROPS_JT_MASK
)>>UPROPS_JT_SHIFT
;
246 return max
!=0 ? max
: (int32_t)U_JT_COUNT
-1;
247 case UCHAR_LINE_BREAK
:
248 max
=(uprv_getMaxValues(0)&UPROPS_LB_MASK
)>>UPROPS_LB_SHIFT
;
249 return max
!=0 ? max
: (int32_t)U_LB_COUNT
-1;
250 case UCHAR_NUMERIC_TYPE
:
251 return (int32_t)U_NT_COUNT
-1;
/* NOTE(review): no case label is visible directly above this assignment (numbering skips 252). Masked only, no shift. */
253 max
=uprv_getMaxValues(0)&UPROPS_SCRIPT_MASK
;
254 return max
!=0 ? max
: (int32_t)USCRIPT_CODE_LIMIT
-1;
255 case UCHAR_HANGUL_SYLLABLE_TYPE
:
256 return (int32_t)U_HST_COUNT
-1;
257 #if !UCONFIG_NO_NORMALIZATION
/* Quick-check maxima differ by form: UNORM_YES for NFD/NFKD, UNORM_MAYBE for NFC/NFKC. */
258 case UCHAR_NFD_QUICK_CHECK
:
259 case UCHAR_NFKD_QUICK_CHECK
:
260 return (int32_t)UNORM_YES
; /* these are never "maybe", only "no" or "yes" */
261 case UCHAR_NFC_QUICK_CHECK
:
262 case UCHAR_NFKC_QUICK_CHECK
:
263 return (int32_t)UNORM_MAYBE
;
266 return -1; /* undefined */
269 return -1; /* undefined */
/*
 * uprops_getSource(which): report which UPropertySource supplies property `which`.
 *
 * Dispatch visible in this listing, in order:
 *   which <  UCHAR_BINARY_START  -> UPROPS_SRC_NONE (marked undefined)
 *   which <  UCHAR_BINARY_LIMIT  -> UPROPS_SRC_CHAR when the binProps[] row has a
 *                                   nonzero mask; a second return yields the row's
 *                                   column converted to UPropertySource
 *   which <  UCHAR_INT_START     -> UPROPS_SRC_NONE (marked undefined)
 *   which <  UCHAR_INT_LIMIT     -> UPROPS_SRC_HST for the Hangul syllable type,
 *                                   UPROPS_SRC_NORM for the combining-class and
 *                                   quick-check properties; a return of
 *                                   UPROPS_SRC_CHAR follows the case list
 *   which == UCHAR_GENERAL_CATEGORY_MASK -> UPROPS_SRC_CHAR
 *   a final return of UPROPS_SRC_NONE, marked undefined, follows.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbering
 * skips 280, 282, 286, 297, 299, 302 and 304 onwards). The branch that separates
 * the two binary-property returns, the switch header, the label above the
 * UPROPS_SRC_CHAR return and the closing braces are not visible - confirm them
 * against the complete source.
 */
273 U_CAPI UPropertySource U_EXPORT2
274 uprops_getSource(UProperty which
) {
275 if(which
<UCHAR_BINARY_START
) {
276 return UPROPS_SRC_NONE
; /* undefined */
277 } else if(which
<UCHAR_BINARY_LIMIT
) {
/* A nonzero mask marks a property that is stored directly; for the others the column slot holds the source. */
278 if(binProps
[which
].mask
!=0) {
279 return UPROPS_SRC_CHAR
;
281 return (UPropertySource
)binProps
[which
].column
;
283 } else if(which
<UCHAR_INT_START
) {
284 return UPROPS_SRC_NONE
; /* undefined */
285 } else if(which
<UCHAR_INT_LIMIT
) {
287 case UCHAR_HANGUL_SYLLABLE_TYPE
:
288 return UPROPS_SRC_HST
;
/* All of the following labels share the UPROPS_SRC_NORM return. */
289 case UCHAR_CANONICAL_COMBINING_CLASS
:
290 case UCHAR_NFD_QUICK_CHECK
:
291 case UCHAR_NFKD_QUICK_CHECK
:
292 case UCHAR_NFC_QUICK_CHECK
:
293 case UCHAR_NFKC_QUICK_CHECK
:
294 case UCHAR_LEAD_CANONICAL_COMBINING_CLASS
:
295 case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS
:
296 return UPROPS_SRC_NORM
;
/* NOTE(review): the label for this return is not visible (numbering skips 297). */
298 return UPROPS_SRC_CHAR
;
300 } else if(which
==UCHAR_GENERAL_CATEGORY_MASK
) {
301 return UPROPS_SRC_CHAR
;
303 return UPROPS_SRC_NONE
; /* undefined */
307 /*----------------------------------------------------------------
309 *----------------------------------------------------------------*/
/*
 * Return a set of characters for property enumeration.
 * The set implicitly contains 0x110000 as well, which is one more than the highest
 * Unicode code point.
 *
 * This set is used as an ordered list - its code points are ordered, and
 * consecutive code points (in Unicode code point order) in the set define a range.
 * For each two consecutive characters (start, limit) in the set,
 * all of the UCD/normalization and related properties for
 * all code points start..limit-1 are all the same,
 * except for character names and ISO comments.
 *
 * All Unicode code points U+0000..U+10ffff are covered by these ranges.
 * The ranges define a partition of the Unicode code space.
 * ICU uses the inclusions set to enumerate properties for generating
 * UnicodeSets containing all code points that have a certain property value.
 *
 * The Inclusion List is generated from the UCD. It is generated
 * by enumerating the data tries, and code points for hardcoded properties
 * are added to the list.
 *
 * --------------------------------------------------------------------------
 *
 * The following are ideas for getting properties-unique code point ranges,
 * with possible optimizations beyond the current implementation.
 * These optimizations would require more code and be more fragile.
 * The current implementation generates one single list (set) for all properties.
 *
 * To enumerate properties efficiently, one needs to know ranges of
 * repetitive values, so that the value of only each start code point
 * can be applied to the whole range.
 * This information is in principle available in the uprops.icu/unorm.icu data.
 *
 * There are two obstacles:
 *
 * 1. Some properties are computed from multiple data structures,
 *    making it necessary to get repetitive ranges by intersecting
 *    ranges from multiple tries.
 *
 * 2. It is not economical to write code for getting repetitive ranges
 *    that are precise for each of some 50 properties.
 *
 * Compromise ideas:
 *
 * - Get ranges per trie, not per individual property.
 *   Each range contains the same values for a whole group of properties.
 *   This would generate currently five range sets, two for uprops.icu tries
 *   and three for unorm.icu tries.
 *
 * - Combine sets of ranges for multiple tries to get sufficient sets
 *   for properties, e.g., the uprops.icu main and auxiliary tries
 *   for all non-normalization properties.
 *
 * Ideas for representing ranges and combining them:
 *
 * - A UnicodeSet could hold just the start code points of ranges.
 *   Multiple sets are easily combined by or-ing them together.
 *
 * - Alternatively, a UnicodeSet could hold each even-numbered range.
 *   All ranges could be enumerated by using each start code point
 *   (for the even-numbered ranges) as well as each limit (end+1) code point
 *   (for the odd-numbered ranges).
 *   It should be possible to combine two such sets by xor-ing them,
 *   but no more than two.
 *
 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
 * but the first one is certainly simpler and applicable for combining more than
 * two range sets.
 *
 * It is possible to combine all range sets for all uprops/unorm tries into one
 * set that can be used for all properties.
 * As an optimization, there could be less-combined range sets for certain
 * groups of properties.
 * The relationship of which less-combined range set to use for which property
 * depends on the implementation of the properties and must be hardcoded
 * - somewhat error-prone and higher maintenance but can be tested easily
 * by building property sets "the simple way" in test code.
 */
/*
 * Do not use a UnicodeSet pattern because that causes infinite recursion;
 * UnicodeSet depends on the inclusions set.
 */
/*
 * uprv_getInclusions() is commented out starting 2004-sep-13 because
 * uniset_props.cpp now calls the uxyz_addPropertyStarts() functions directly,
 * and only for the relevant property source.
 */
402 U_CAPI
void U_EXPORT2
403 uprv_getInclusions(USetAdder
*sa
, UErrorCode
*pErrorCode
) {
404 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
408 #if !UCONFIG_NO_NORMALIZATION
409 unorm_addPropertyStarts(sa
, pErrorCode
);
411 uchar_addPropertyStarts(sa
, pErrorCode
);
412 ucase_addPropertyStarts(ucase_getSingleton(pErrorCode
), sa
, pErrorCode
);