]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
374ca955 | 4 | * Copyright (C) 2002-2004, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: uprops.h | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2002feb24 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Constants for mostly non-core Unicode character properties | |
17 | * stored in uprops.dat. | |
18 | */ | |
19 | ||
20 | #ifndef __UPROPS_H__ | |
21 | #define __UPROPS_H__ | |
22 | ||
23 | #include "unicode/utypes.h" | |
24 | #include "unicode/uset.h" | |
374ca955 A |
25 | #include "uset_imp.h" |
26 | #include "ucase.h" | |
27 | #include "udataswp.h" | |
b75a7d8f A |
28 | |
29 | /* indexes[] entries */ | |
30 | enum { | |
31 | UPROPS_PROPS32_INDEX, | |
32 | UPROPS_EXCEPTIONS_INDEX, | |
33 | UPROPS_EXCEPTIONS_TOP_INDEX, | |
34 | ||
35 | UPROPS_ADDITIONAL_TRIE_INDEX, | |
36 | UPROPS_ADDITIONAL_VECTORS_INDEX, | |
37 | UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX, | |
38 | ||
39 | UPROPS_RESERVED_INDEX, /* 6 */ | |
40 | ||
41 | /* maximum values for code values in vector word 0 */ | |
42 | UPROPS_MAX_VALUES_INDEX=10, | |
43 | /* maximum values for code values in vector word 2 */ | |
44 | UPROPS_MAX_VALUES_2_INDEX, | |
45 | ||
46 | UPROPS_INDEX_COUNT=16 | |
47 | }; | |
48 | ||
49 | /* definitions for the main properties words */ | |
50 | enum { | |
51 | /* general category shift==0 0 (5 bits) */ | |
52 | UPROPS_EXCEPTION_SHIFT=5, /* 5 (1 bit) */ | |
53 | UPROPS_BIDI_SHIFT, /* 6 (5 bits) */ | |
54 | UPROPS_MIRROR_SHIFT=UPROPS_BIDI_SHIFT+5, /* 11 (1 bit) */ | |
55 | UPROPS_NUMERIC_TYPE_SHIFT, /* 12 (3 bits) */ | |
56 | UPROPS_CASE_SENSITIVE_SHIFT=UPROPS_NUMERIC_TYPE_SHIFT+3,/* 15 (1 bit) format version 3.2 */ | |
57 | UPROPS_RESERVED_SHIFT, /* 16 (4 bits) */ | |
58 | UPROPS_VALUE_SHIFT=20, /* 20 */ | |
59 | ||
60 | UPROPS_EXCEPTION_BIT=1UL<<UPROPS_EXCEPTION_SHIFT, | |
61 | UPROPS_VALUE_BITS=32-UPROPS_VALUE_SHIFT, | |
62 | ||
63 | UPROPS_MIN_VALUE=-(1L<<(UPROPS_VALUE_BITS-1)), | |
64 | UPROPS_MAX_VALUE=(1L<<(UPROPS_VALUE_BITS-1))-1, | |
65 | UPROPS_MAX_EXCEPTIONS_COUNT=1L<<UPROPS_VALUE_BITS | |
66 | }; | |
67 | ||
68 | #define PROPS_VALUE_IS_EXCEPTION(props) ((props)&UPROPS_EXCEPTION_BIT) | |
69 | #define GET_CATEGORY(props) ((props)&0x1f) | |
70 | #define GET_BIDI_CLASS(props) (((props)>>UPROPS_BIDI_SHIFT)&0x1f) /* 5-bit field; argument parenthesized like the sibling GET_ macros */ | |
71 | #define GET_NUMERIC_TYPE(props) (((props)>>UPROPS_NUMERIC_TYPE_SHIFT)&7) | |
72 | #define GET_UNSIGNED_VALUE(props) ((props)>>UPROPS_VALUE_SHIFT) | |
73 | #define GET_SIGNED_VALUE(props) ((int32_t)(props)>>UPROPS_VALUE_SHIFT) | |
74 | #define GET_EXCEPTIONS(props) (exceptionsTable+GET_UNSIGNED_VALUE(props)) | |
75 | ||
76 | #define CAT_MASK(props) U_MASK(GET_CATEGORY(props)) | |
77 | ||
78 | enum { | |
79 | EXC_UPPERCASE, | |
80 | EXC_LOWERCASE, | |
81 | EXC_TITLECASE, | |
82 | EXC_UNUSED, | |
83 | EXC_NUMERIC_VALUE, | |
84 | EXC_DENOMINATOR_VALUE, | |
85 | EXC_MIRROR_MAPPING, | |
86 | EXC_SPECIAL_CASING, | |
87 | EXC_CASE_FOLDING | |
88 | }; | |
89 | ||
90 | /* number of properties vector words */ | |
91 | #define UPROPS_VECTOR_WORDS 3 | |
92 | ||
93 | /* | |
94 | * Properties in vector word 0 | |
95 | * Bits | |
96 | * 31..24 DerivedAge version major/minor one nibble each | |
97 | * 23 reserved | |
98 | * 22..18 Line Break | |
99 | * 17..15 East Asian Width | |
100 | * 14.. 7 UBlockCode | |
101 | * 6.. 0 UScriptCode | |
102 | */ | |
103 | ||
104 | /* derived age: one nibble each for major and minor version numbers */ | |
105 | #define UPROPS_AGE_MASK 0xff000000 | |
106 | #define UPROPS_AGE_SHIFT 24 | |
107 | ||
108 | #define UPROPS_LB_MASK 0x007C0000 | |
109 | #define UPROPS_LB_SHIFT 18 | |
110 | ||
111 | #define UPROPS_EA_MASK 0x00038000 | |
112 | #define UPROPS_EA_SHIFT 15 | |
113 | ||
114 | #define UPROPS_BLOCK_MASK 0x00007f80 | |
115 | #define UPROPS_BLOCK_SHIFT 7 | |
116 | ||
117 | #define UPROPS_SCRIPT_MASK 0x0000007f | |
118 | ||
119 | /* | |
120 | * Properties in vector word 1 | |
121 | * Each bit encodes one binary property. | |
122 | * The following constants represent the bit number, use 1<<UPROPS_XYZ. | |
123 | * UPROPS_BINARY_1_TOP<=32! | |
124 | * | |
125 | * Keep this list of property enums in sync with | |
126 | * propListNames[] in icu/source/tools/genprops/props2.c! | |
127 | * | |
128 | * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". | |
129 | */ | |
130 | enum { | |
131 | UPROPS_WHITE_SPACE, | |
132 | UPROPS_BIDI_CONTROL, | |
133 | UPROPS_JOIN_CONTROL, | |
134 | UPROPS_DASH, | |
135 | UPROPS_HYPHEN, | |
136 | UPROPS_QUOTATION_MARK, | |
137 | UPROPS_TERMINAL_PUNCTUATION, | |
138 | UPROPS_MATH, | |
139 | UPROPS_HEX_DIGIT, | |
140 | UPROPS_ASCII_HEX_DIGIT, | |
141 | UPROPS_ALPHABETIC, | |
142 | UPROPS_IDEOGRAPHIC, | |
143 | UPROPS_DIACRITIC, | |
144 | UPROPS_EXTENDER, | |
145 | UPROPS_LOWERCASE, | |
146 | UPROPS_UPPERCASE, | |
147 | UPROPS_NONCHARACTER_CODE_POINT, | |
148 | UPROPS_GRAPHEME_EXTEND, | |
149 | UPROPS_GRAPHEME_LINK, | |
150 | UPROPS_IDS_BINARY_OPERATOR, | |
151 | UPROPS_IDS_TRINARY_OPERATOR, | |
152 | UPROPS_RADICAL, | |
153 | UPROPS_UNIFIED_IDEOGRAPH, | |
154 | UPROPS_DEFAULT_IGNORABLE_CODE_POINT, | |
155 | UPROPS_DEPRECATED, | |
156 | UPROPS_SOFT_DOTTED, | |
157 | UPROPS_LOGICAL_ORDER_EXCEPTION, | |
158 | UPROPS_XID_START, | |
159 | UPROPS_XID_CONTINUE, | |
160 | UPROPS_ID_START, /* ICU 2.6, uprops format version 3.2 */ | |
161 | UPROPS_ID_CONTINUE, | |
162 | UPROPS_GRAPHEME_BASE, | |
163 | UPROPS_BINARY_1_TOP /* ==32 - full! */ | |
164 | }; | |
165 | ||
166 | /* | |
167 | * Properties in vector word 2 | |
168 | * Bits | |
374ca955 | 169 | * 31..24 More binary properties |
b75a7d8f A |
170 | * 13..11 Joining Type |
171 | * 10.. 5 Joining Group | |
172 | * 4.. 0 Decomposition Type | |
173 | */ | |
174 | #define UPROPS_JT_MASK 0x00003800 | |
175 | #define UPROPS_JT_SHIFT 11 | |
176 | ||
177 | #define UPROPS_JG_MASK 0x000007e0 | |
178 | #define UPROPS_JG_SHIFT 5 | |
179 | ||
180 | #define UPROPS_DT_MASK 0x0000001f | |
181 | ||
374ca955 A |
182 | enum { |
183 | UPROPS_V2_S_TERM=24, /* new in ICU 3.0 and Unicode 4.0.1 */ | |
184 | UPROPS_V2_VARIATION_SELECTOR, | |
185 | UPROPS_V2_TOP /* must be <=32 */ | |
186 | }; | |
187 | ||
b75a7d8f A |
188 | /** |
189 | * Get a properties vector word for a code point. | |
190 | * Implemented in uchar.c for uprops.c. | |
191 | * column==-1 gets the 32-bit main properties word instead. | |
192 | * @return 0 if no data or illegal argument | |
193 | */ | |
194 | U_CFUNC uint32_t | |
195 | u_getUnicodeProperties(UChar32 c, int32_t column); | |
196 | ||
197 | /** | |
198 | * Get the maximum values for some enum/int properties. | |
199 | * Use the same column numbers as for u_getUnicodeProperties(). | |
200 | * The returned value will contain maximum values stored in the same bit fields | |
201 | * as where the enum values are stored in the u_getUnicodeProperties() | |
202 | * return values for the same columns. | |
203 | * | |
204 | * Valid columns are those for properties words that contain enumerated values. | |
205 | * (ICU 2.6: columns 0 and 2) | |
206 | * For other column numbers, this function will return 0. | |
207 | * | |
208 | * @internal | |
209 | */ | |
210 | U_CFUNC int32_t | |
211 | uprv_getMaxValues(int32_t column); | |
212 | ||
213 | /** | |
374ca955 | 214 | * Get the Hangul Syllable Type for c. |
b75a7d8f A |
215 | * @internal |
216 | */ | |
374ca955 A |
217 | U_CFUNC UHangulSyllableType |
218 | uchar_getHST(UChar32 c); | |
b75a7d8f A |
219 | |
220 | /** Turn a bit index into a bit flag. @internal */ | |
221 | #define FLAG(n) ((uint32_t)1<<(n)) | |
222 | ||
223 | /** Flags for general categories in the order of UCharCategory. @internal */ | |
224 | #define _Cn FLAG(U_GENERAL_OTHER_TYPES) | |
225 | #define _Lu FLAG(U_UPPERCASE_LETTER) | |
226 | #define _Ll FLAG(U_LOWERCASE_LETTER) | |
227 | #define _Lt FLAG(U_TITLECASE_LETTER) | |
228 | #define _Lm FLAG(U_MODIFIER_LETTER) | |
229 | #define _Lo FLAG(U_OTHER_LETTER) | |
230 | #define _Mn FLAG(U_NON_SPACING_MARK) | |
231 | #define _Me FLAG(U_ENCLOSING_MARK) | |
232 | #define _Mc FLAG(U_COMBINING_SPACING_MARK) | |
233 | #define _Nd FLAG(U_DECIMAL_DIGIT_NUMBER) | |
234 | #define _Nl FLAG(U_LETTER_NUMBER) | |
235 | #define _No FLAG(U_OTHER_NUMBER) | |
236 | #define _Zs FLAG(U_SPACE_SEPARATOR) | |
237 | #define _Zl FLAG(U_LINE_SEPARATOR) | |
238 | #define _Zp FLAG(U_PARAGRAPH_SEPARATOR) | |
239 | #define _Cc FLAG(U_CONTROL_CHAR) | |
240 | #define _Cf FLAG(U_FORMAT_CHAR) | |
241 | #define _Co FLAG(U_PRIVATE_USE_CHAR) | |
242 | #define _Cs FLAG(U_SURROGATE) | |
243 | #define _Pd FLAG(U_DASH_PUNCTUATION) | |
244 | #define _Ps FLAG(U_START_PUNCTUATION) | |
245 | #define _Pe FLAG(U_END_PUNCTUATION) | |
246 | #define _Pc FLAG(U_CONNECTOR_PUNCTUATION) | |
247 | #define _Po FLAG(U_OTHER_PUNCTUATION) | |
248 | #define _Sm FLAG(U_MATH_SYMBOL) | |
249 | #define _Sc FLAG(U_CURRENCY_SYMBOL) | |
250 | #define _Sk FLAG(U_MODIFIER_SYMBOL) | |
251 | #define _So FLAG(U_OTHER_SYMBOL) | |
252 | #define _Pi FLAG(U_INITIAL_PUNCTUATION) | |
253 | #define _Pf FLAG(U_FINAL_PUNCTUATION) | |
254 | ||
255 | /** Some code points. @internal */ | |
256 | enum { | |
257 | TAB =0x0009, | |
258 | LF =0x000a, | |
259 | FF =0x000c, | |
260 | CR =0x000d, | |
261 | U_A =0x0041, | |
262 | U_Z =0x005a, | |
263 | U_a =0x0061, | |
264 | U_z =0x007a, | |
265 | DEL =0x007f, | |
266 | NL =0x0085, | |
267 | NBSP =0x00a0, | |
268 | CGJ =0x034f, | |
269 | FIGURESP=0x2007, | |
270 | HAIRSP =0x200a, | |
271 | ZWNJ =0x200c, | |
272 | ZWJ =0x200d, | |
273 | RLM =0x200f, | |
274 | NNBSP =0x202f, | |
275 | WJ =0x2060, | |
276 | INHSWAP =0x206a, | |
277 | NOMDIG =0x206f, | |
278 | ZWNBSP =0xfeff | |
279 | }; | |
280 | ||
b75a7d8f A |
281 | /** |
282 | * Get the maximum length of a (regular/1.0/extended) character name. | |
283 | * @return 0 if no character names available. | |
284 | */ | |
285 | U_CAPI int32_t U_EXPORT2 | |
286 | uprv_getMaxCharNameLength(void); | |
287 | ||
288 | #if 0 | |
289 | /* | |
290 | Currently not used but left for future use. Probably by UnicodeSet. | |
291 | urename.h and unames.c changed accordingly. | |
292 | */ | |
293 | /** | |
294 | * Get the maximum length of an ISO comment. | |
295 | * @return 0 if no ISO comments available. | |
296 | */ | |
297 | U_CAPI int32_t U_EXPORT2 | |
298 | uprv_getMaxISOCommentLength(void); | |
299 | #endif | |
300 | ||
301 | /** | |
302 | * Fills set with characters that are used in Unicode character names. | |
303 | * Includes all characters that are used in regular/Unicode 1.0/extended names. | |
304 | * Just empties the set if no character names are available. | |
374ca955 | 305 | * @param sa USetAdder to receive characters. |
b75a7d8f A |
306 | */ |
307 | U_CAPI void U_EXPORT2 | |
374ca955 | 308 | uprv_getCharNameCharacters(USetAdder *sa); |
b75a7d8f A |
309 | |
310 | #if 0 | |
311 | /* | |
312 | Currently not used but left for future use. Probably by UnicodeSet. | |
313 | urename.h and unames.c changed accordingly. | |
314 | */ | |
315 | /** | |
316 | * Fills set with characters that are used in ISO comments. | |
317 | * Just empties the set if no ISO comments are available. | |
374ca955 | 318 | * @param sa USetAdder to receive characters. |
b75a7d8f A |
319 | */ | |
320 | U_CAPI void U_EXPORT2 | |
374ca955 | 321 | uprv_getISOCommentCharacters(USetAdder *sa); |
b75a7d8f A |
323 | #endif | |
324 | ||
374ca955 A |
325 | /** |
326 | * Constants for which data and implementation files provide which properties. | |
327 | * Used by UnicodeSet for service-specific property enumeration. | |
328 | * @internal | |
329 | */ | |
330 | enum UPropertySource { | |
331 | /** No source, not a supported property. */ | |
332 | UPROPS_SRC_NONE, | |
333 | /** From uchar.c/uprops.icu */ | |
334 | UPROPS_SRC_CHAR, | |
335 | /** Hangul_Syllable_Type, from uchar.c/uprops.icu */ | |
336 | UPROPS_SRC_HST, | |
337 | /** From unames.c/unames.icu */ | |
338 | UPROPS_SRC_NAMES, | |
339 | /** From unorm.cpp/unorm.icu */ | |
340 | UPROPS_SRC_NORM, | |
341 | /** From ucase.c/ucase.icu */ | |
342 | UPROPS_SRC_CASE, | |
343 | /** From ubidi.c/ubidi.icu */ | |
344 | UPROPS_SRC_BIDI, | |
345 | /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */ | |
346 | UPROPS_SRC_COUNT | |
347 | }; | |
348 | typedef enum UPropertySource UPropertySource; | |
349 | ||
350 | /** | |
351 | * @see UPropertySource | |
352 | * @internal | |
353 | */ | |
354 | U_CAPI UPropertySource U_EXPORT2 | |
355 | uprops_getSource(UProperty which); | |
356 | ||
b75a7d8f A |
357 | /** |
358 | * Enumerate each core properties data trie and add the | |
359 | * start of each range of same properties to the set. | |
360 | * @internal | |
361 | */ | |
362 | U_CAPI void U_EXPORT2 | |
374ca955 A |
363 | uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode); |
364 | ||
365 | /** | |
366 | * Same as uchar_addPropertyStarts() but only for Hangul_Syllable_Type. | |
367 | * @internal | |
368 | */ | |
369 | U_CAPI void U_EXPORT2 | |
370 | uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode); | |
b75a7d8f A |
371 | |
372 | /** | |
373 | * Return a set of characters for property enumeration. | |
374 | * For each two consecutive characters (start, limit) in the set, | |
375 | * all of the properties for start..limit-1 are all the same. | |
376 | * | |
374ca955 | 377 | * @param sa USetAdder to receive result. Existing contents are lost. |
b75a7d8f A |
378 | * @internal |
379 | */ | |
380 | U_CAPI void U_EXPORT2 | |
374ca955 A |
381 | uprv_getInclusions(USetAdder *sa, UErrorCode *pErrorCode); |
382 | ||
383 | /** | |
384 | * Swap the ICU Unicode properties file. See uchar.c. | |
385 | * @internal | |
386 | */ | |
387 | U_CAPI int32_t U_EXPORT2 | |
388 | uprops_swap(const UDataSwapper *ds, | |
389 | const void *inData, int32_t length, void *outData, | |
390 | UErrorCode *pErrorCode); | |
391 | ||
392 | /** | |
393 | * Swap the ICU Unicode character names file. See uchar.c. | |
394 | * @internal | |
395 | */ | |
396 | U_CAPI int32_t U_EXPORT2 | |
397 | uchar_swapNames(const UDataSwapper *ds, | |
398 | const void *inData, int32_t length, void *outData, | |
399 | UErrorCode *pErrorCode); | |
b75a7d8f A |
400 | |
401 | #endif |