]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2004, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: ucase.h | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004aug30 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Low-level Unicode character/string case mapping code. | |
17 | */ | |
18 | ||
19 | #ifndef __UCASE_H__ | |
20 | #define __UCASE_H__ | |
21 | ||
22 | #include "unicode/utypes.h" | |
23 | #include "unicode/uset.h" | |
24 | #include "uset_imp.h" | |
25 | #include "udataswp.h" | |
26 | ||
27 | U_CDECL_BEGIN | |
28 | ||
29 | /* library API -------------------------------------------------------------- */ | |
30 | ||
31 | struct UCaseProps; | |
32 | typedef struct UCaseProps UCaseProps; | |
33 | ||
34 | U_CAPI UCaseProps * U_EXPORT2 | |
35 | ucase_open(UErrorCode *pErrorCode); | |
36 | ||
37 | U_CAPI UCaseProps * U_EXPORT2 | |
38 | ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode); | |
39 | ||
40 | U_CAPI void U_EXPORT2 | |
41 | ucase_close(UCaseProps *csp); | |
42 | ||
43 | ||
44 | U_CAPI UCaseProps * U_EXPORT2 | |
45 | ucase_getSingleton(UErrorCode *pErrorCode); | |
46 | ||
47 | ||
48 | U_CAPI int32_t U_EXPORT2 | |
49 | ucase_swap(const UDataSwapper *ds, | |
50 | const void *inData, int32_t length, void *outData, | |
51 | UErrorCode *pErrorCode); | |
52 | ||
53 | U_CAPI void U_EXPORT2 | |
54 | ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode); | |
55 | ||
56 | /** | |
57 | * Bit mask for getting just the options from a string compare options word | |
58 | * that are relevant for case-insensitive string comparison. | |
59 | * See uchar.h. The mask (the low 16 bits) also includes _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER. | |
60 | * @internal | |
61 | */ | |
62 | #define _STRCASECMP_OPTIONS_MASK 0xffff | |
63 | ||
64 | /** | |
65 | * Bit mask for getting just the options from a string compare options word | |
66 | * that are relevant for case folding (of a single string or code point). | |
67 | * See uchar.h. | |
68 | * @internal | |
69 | */ | |
70 | #define _FOLD_CASE_OPTIONS_MASK 0xff | |
71 | ||
72 | /* single-code point functions */ | |
73 | ||
74 | U_CAPI UChar32 U_EXPORT2 | |
75 | ucase_tolower(const UCaseProps *csp, UChar32 c); | |
76 | ||
77 | U_CAPI UChar32 U_EXPORT2 | |
78 | ucase_toupper(const UCaseProps *csp, UChar32 c); | |
79 | ||
80 | U_CAPI UChar32 U_EXPORT2 | |
81 | ucase_totitle(const UCaseProps *csp, UChar32 c); | |
82 | ||
83 | U_CAPI UChar32 U_EXPORT2 | |
84 | ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options); /* NOTE(review): csp is not const here, unlike the other single-code point functions -- confirm whether that is intentional. options: presumably the bits selected by _FOLD_CASE_OPTIONS_MASK -- confirm. */ | |
85 | ||
86 | /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ | |
87 | U_CAPI int32_t U_EXPORT2 | |
88 | ucase_getType(const UCaseProps *csp, UChar32 c); | |
89 | ||
90 | /** @return same as ucase_getType(), or <0 if c is case-ignorable */ | |
91 | U_CAPI int32_t U_EXPORT2 | |
92 | ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c); | |
93 | ||
94 | U_CAPI UBool U_EXPORT2 | |
95 | ucase_isSoftDotted(const UCaseProps *csp, UChar32 c); | |
96 | ||
97 | U_CAPI UBool U_EXPORT2 | |
98 | ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c); | |
99 | ||
100 | /* string case mapping functions */ | |
101 | ||
102 | /** | |
103 | * Iterator function for string case mappings, which need to look at the | |
104 | * context (surrounding text) of a given character for conditional mappings. | |
105 | * | |
106 | * The iterator only needs to go backward or forward away from the | |
107 | * character in question. It does not use any indexes on this interface. | |
108 | * It does not support random access or an arbitrary change of | |
109 | * iteration direction. | |
110 | * | |
111 | * The direction parameter either starts a new iteration (backward or forward) or continues the current one. | |
112 | * | |
113 | * @param context A pointer to the iterator's working data. | |
114 | * @param dir If <0 then start iterating backward from the character; | |
115 | * if >0 then start iterating forward from the character; | |
116 | * if 0 then continue iterating in the current direction. | |
117 | * @return Next code point, or <0 when the iteration is done. | |
118 | */ | |
119 | typedef UChar32 U_CALLCONV | |
120 | UCaseContextIterator(void *context, int8_t dir); | |
121 | ||
122 | /** | |
123 | * Sample struct which may be used by some implementations of | |
124 | * UCaseContextIterator. | |
125 | * NOTE(review): the field meanings are not documented in this header; the per-field notes below are assumptions to confirm against the iterator implementations. | |
126 | */ | |
126 | struct UCaseContext { | |
127 | void *p; /* presumably the text being iterated -- confirm */ | |
128 | int32_t start, index, limit; /* presumably text bounds and the current iteration position -- confirm */ | |
129 | int32_t cpStart, cpLimit; /* presumably the bounds of the character being mapped -- confirm */ | |
130 | int8_t dir; /* presumably the current direction, as in UCaseContextIterator's dir -- confirm */ | |
131 | int8_t b1, b2, b3; /* purpose not visible in this header */ | |
132 | }; | |
133 | typedef struct UCaseContext UCaseContext; | |
134 | ||
135 | enum { | |
136 | /** | |
137 | * For string case mappings, a single character (a code point) is mapped | |
138 | * either to itself (in which case in-place mapping functions do nothing), | |
139 | * or to another single code point, or to a string. | |
140 | * Aside from the string contents, these are indicated with a single int32_t | |
141 | * value as follows: | |
142 | * | |
143 | * Mapping to self: Negative values (~self instead of -self to support U+0000) | |
144 | * | |
145 | * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH | |
146 | * | |
147 | * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is | |
148 | * returned. Note that the string result may indeed have zero length. | |
149 | */ | |
150 | UCASE_MAX_STRING_LENGTH=0x1f | |
151 | }; | |
152 | ||
153 | /** | |
154 | * Get the full lowercase mapping for c. | |
155 | * | |
156 | * @param csp Case mapping properties. | |
157 | * @param c Character to be mapped. | |
158 | * @param iter Character iterator, used for context-sensitive mappings. | |
159 | * See UCaseContextIterator for details. | |
160 | * If iter==NULL then a context-independent result is returned. | |
161 | * @param context Pointer to be passed into iter. | |
162 | * @param pString If the mapping result is a string, then the pointer is | |
163 | * written to *pString. | |
164 | * @param locale Locale ID for locale-dependent mappings. | |
165 | * @param locCache Initialize to 0; may be used to cache the result of parsing | |
166 | * the locale ID for subsequent calls. | |
167 | * Can be NULL. | |
168 | * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH. | |
169 | * | |
170 | * @see UCaseContextIterator | |
171 | * @see UCASE_MAX_STRING_LENGTH | |
172 | * @internal | |
173 | */ | |
174 | U_CAPI int32_t U_EXPORT2 | |
175 | ucase_toFullLower(const UCaseProps *csp, UChar32 c, | |
176 | UCaseContextIterator *iter, void *context, | |
177 | const UChar **pString, | |
178 | const char *locale, int32_t *locCache); | |
179 | ||
180 | U_CAPI int32_t U_EXPORT2 /* NOTE(review): undocumented; same signature as ucase_toFullLower() -- presumably the same contract for the full uppercase mapping, confirm */ | |
181 | ucase_toFullUpper(const UCaseProps *csp, UChar32 c, | |
182 | UCaseContextIterator *iter, void *context, | |
183 | const UChar **pString, | |
184 | const char *locale, int32_t *locCache); | |
185 | ||
186 | U_CAPI int32_t U_EXPORT2 /* NOTE(review): undocumented; same signature as ucase_toFullLower() -- presumably the same contract for the full titlecase mapping, confirm */ | |
187 | ucase_toFullTitle(const UCaseProps *csp, UChar32 c, | |
188 | UCaseContextIterator *iter, void *context, | |
189 | const UChar **pString, | |
190 | const char *locale, int32_t *locCache); | |
191 | ||
192 | U_CAPI int32_t U_EXPORT2 /* NOTE(review): undocumented; takes no context iterator or locale, only an options word -- presumably the bits selected by _FOLD_CASE_OPTIONS_MASK, confirm */ | |
193 | ucase_toFullFolding(const UCaseProps *csp, UChar32 c, | |
194 | const UChar **pString, | |
195 | uint32_t options); | |
196 | ||
197 | /* file definitions --------------------------------------------------------- */ | |
198 | ||
199 | #define UCASE_DATA_NAME "ucase" | |
200 | #define UCASE_DATA_TYPE "icu" | |
201 | ||
202 | /* format "cAsE" */ | |
203 | #define UCASE_FMT_0 0x63 | |
204 | #define UCASE_FMT_1 0x41 | |
205 | #define UCASE_FMT_2 0x53 | |
206 | #define UCASE_FMT_3 0x45 | |
207 | ||
208 | /* indexes into indexes[]; enumerators without an initializer count up from 0 */ | |
209 | enum { | |
210 | UCASE_IX_INDEX_TOP, /* 0 */ | |
211 | UCASE_IX_LENGTH, /* 1 */ | |
212 | UCASE_IX_TRIE_SIZE, /* 2 */ | |
213 | UCASE_IX_EXC_LENGTH, /* 3 */ | |
214 | ||
215 | UCASE_IX_MAX_FULL_LENGTH=15, /* indexes 4..14 have no named constant here */ | |
216 | UCASE_IX_TOP=16 | |
217 | }; | |
218 | ||
219 | /* definitions for 16-bit case properties word ------------------------------ */ | |
220 | ||
221 | /* 2-bit constants for types of cased characters */ | |
222 | #define UCASE_TYPE_MASK 3 | |
223 | enum { | |
224 | UCASE_NONE, | |
225 | UCASE_LOWER, | |
226 | UCASE_UPPER, | |
227 | UCASE_TITLE | |
228 | }; | |
229 | ||
230 | #define UCASE_SENSITIVE 4 | |
231 | #define UCASE_EXCEPTION 8 | |
232 | ||
233 | #define UCASE_DOT_MASK 0x30 | |
234 | enum { | |
235 | UCASE_NO_DOT=0, /* normal characters with cc=0 */ | |
236 | UCASE_SOFT_DOTTED=0x10, /* soft-dotted characters with cc=0 */ | |
237 | UCASE_ABOVE=0x20, /* "above" accents with cc=230 */ | |
238 | UCASE_OTHER_ACCENT=0x30 /* other accent character (0<cc!=230) */ | |
239 | }; | |
240 | ||
241 | /* no exception: bits 15..6 are a 10-bit signed case mapping delta */ | |
242 | #define UCASE_DELTA_SHIFT 6 | |
243 | #define UCASE_DELTA_MASK 0xffc0 | |
244 | #define UCASE_MAX_DELTA 0x1ff /* 511, largest 10-bit signed value */ | |
245 | #define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1) /* -512 */ | |
246 | ||
247 | #define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT) /* the int16_t cast makes the top bit a sign bit; NOTE(review): >> on a negative value is implementation-defined in C, this assumes an arithmetic (sign-extending) shift */ | |
248 | ||
249 | /* case-ignorable uses one of the delta bits, see gencase/store.c */ | |
250 | #define UCASE_CASE_IGNORABLE 0x40 | |
251 | ||
252 | /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ | |
253 | #define UCASE_EXC_SHIFT 4 | |
254 | #define UCASE_EXC_MASK 0xfff0 | |
255 | #define UCASE_MAX_EXCEPTIONS 0x1000 | |
256 | ||
257 | /* definitions for 16-bit main exceptions word ------------------------------ */ | |
258 | ||
259 | /* first 8 bits indicate values in optional slots */ | |
260 | enum { | |
261 | UCASE_EXC_LOWER, | |
262 | UCASE_EXC_FOLD, | |
263 | UCASE_EXC_UPPER, | |
264 | UCASE_EXC_TITLE, | |
265 | UCASE_EXC_4, /* reserved */ | |
266 | UCASE_EXC_5, /* reserved */ | |
267 | UCASE_EXC_6, /* reserved */ | |
268 | UCASE_EXC_FULL_MAPPINGS, | |
269 | UCASE_EXC_ALL_SLOTS /* one past the last slot */ | |
270 | }; | |
271 | ||
272 | /* each slot is 2 uint16_t instead of 1 */ | |
273 | #define UCASE_EXC_DOUBLE_SLOTS 0x100 | |
274 | ||
275 | /* reserved: exception bits 11..9 */ | |
276 | ||
277 | /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */ | |
278 | #define UCASE_EXC_DOT_SHIFT 8 | |
279 | ||
280 | /* normally stored in the main word, but pushed out for larger exception indexes */ | |
281 | #define UCASE_EXC_DOT_MASK 0x3000 | |
282 | enum { | |
283 | UCASE_EXC_NO_DOT=0, /* UCASE_NO_DOT<<UCASE_EXC_DOT_SHIFT */ | |
284 | UCASE_EXC_SOFT_DOTTED=0x1000, /* UCASE_SOFT_DOTTED<<UCASE_EXC_DOT_SHIFT */ | |
285 | UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */ | |
286 | UCASE_EXC_OTHER_ACCENT=0x3000 /* other accent character (0<cc!=230) */ | |
287 | }; | |
288 | ||
289 | /* complex/conditional mappings */ | |
290 | #define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000 | |
291 | #define UCASE_EXC_CONDITIONAL_FOLD 0x8000 | |
292 | ||
293 | /* definitions for lengths word for full case mappings: one 4-bit field per mapping */ | |
294 | #define UCASE_FULL_LOWER 0xf /* bits 3..0 */ | |
295 | #define UCASE_FULL_FOLDING 0xf0 /* bits 7..4 */ | |
296 | #define UCASE_FULL_UPPER 0xf00 /* bits 11..8 */ | |
297 | #define UCASE_FULL_TITLE 0xf000 /* bits 15..12 */ | |
298 | ||
299 | U_CDECL_END | |
300 | ||
301 | #endif |