]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
2ca993e8 | 4 | * Copyright (C) 2002-2016, International Business Machines |
729e4ab9 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: uprops.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2002feb24 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Implementations for mostly non-core Unicode character properties | |
17 | * stored in uprops.icu. | |
18 | * | |
19 | * With the APIs implemented here, almost all properties files and | |
20 | * their associated implementation files are used from this file, | |
21 | * including those for normalization and case mappings. | |
22 | */ | |
23 | ||
24 | #include "unicode/utypes.h" | |
25 | #include "unicode/uchar.h" | |
26 | #include "unicode/unorm2.h" | |
27 | #include "unicode/uscript.h" | |
28 | #include "unicode/ustring.h" | |
29 | #include "cstring.h" | |
30 | #include "normalizer2impl.h" | |
729e4ab9 A |
31 | #include "umutex.h" |
32 | #include "ubidi_props.h" | |
33 | #include "uprops.h" | |
34 | #include "ucase.h" | |
35 | #include "ustr_imp.h" | |
36 | ||
729e4ab9 A |
37 | U_NAMESPACE_USE |
38 | ||
39 | #define GET_BIDI_PROPS() ubidi_getSingleton() | |
40 | ||
41 | /* general properties API functions ----------------------------------------- */ | |
42 | ||
43 | struct BinaryProperty; | |
44 | ||
45 | typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which); | |
46 | ||
47 | struct BinaryProperty { | |
48 | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 | |
49 | uint32_t mask; | |
50 | BinaryPropertyContains *contains; | |
51 | }; | |
52 | ||
53 | static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { | |
54 | /* systematic, directly stored properties */ | |
55 | return (u_getUnicodeProperties(c, prop.column)&prop.mask)!=0; | |
56 | } | |
57 | ||
58 | static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { | |
59 | return ucase_hasBinaryProperty(c, which); | |
60 | } | |
61 | ||
62 | static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
63 | return ubidi_isBidiControl(GET_BIDI_PROPS(), c); | |
64 | } | |
65 | ||
66 | static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
67 | return ubidi_isMirrored(GET_BIDI_PROPS(), c); | |
68 | } | |
69 | ||
70 | static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
71 | return ubidi_isJoinControl(GET_BIDI_PROPS(), c); | |
72 | } | |
73 | ||
74 | #if UCONFIG_NO_NORMALIZATION | |
75 | static UBool hasFullCompositionExclusion(const BinaryProperty &, UChar32, UProperty) { | |
76 | return FALSE; | |
77 | } | |
78 | #else | |
79 | static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
80 | // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. | |
81 | UErrorCode errorCode=U_ZERO_ERROR; | |
82 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); | |
83 | return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c)); | |
84 | } | |
85 | #endif | |
86 | ||
87 | // UCHAR_NF*_INERT properties | |
88 | #if UCONFIG_NO_NORMALIZATION | |
89 | static UBool isNormInert(const BinaryProperty &, UChar32, UProperty) { | |
90 | return FALSE; | |
91 | } | |
92 | #else | |
93 | static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { | |
94 | UErrorCode errorCode=U_ZERO_ERROR; | |
95 | const Normalizer2 *norm2=Normalizer2Factory::getInstance( | |
96 | (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode); | |
97 | return U_SUCCESS(errorCode) && norm2->isInert(c); | |
98 | } | |
99 | #endif | |
100 | ||
101 | #if UCONFIG_NO_NORMALIZATION | |
102 | static UBool changesWhenCasefolded(const BinaryProperty &, UChar32, UProperty) { | |
103 | return FALSE; | |
104 | } | |
105 | #else | |
106 | static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
107 | UnicodeString nfd; | |
108 | UErrorCode errorCode=U_ZERO_ERROR; | |
b331163b | 109 | const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode); |
729e4ab9 A |
110 | if(U_FAILURE(errorCode)) { |
111 | return FALSE; | |
112 | } | |
113 | if(nfcNorm2->getDecomposition(c, nfd)) { | |
114 | /* c has a decomposition */ | |
115 | if(nfd.length()==1) { | |
116 | c=nfd[0]; /* single BMP code point */ | |
117 | } else if(nfd.length()<=U16_MAX_LENGTH && | |
118 | nfd.length()==U16_LENGTH(c=nfd.char32At(0)) | |
119 | ) { | |
120 | /* single supplementary code point */ | |
121 | } else { | |
122 | c=U_SENTINEL; | |
123 | } | |
124 | } else if(c<0) { | |
125 | return FALSE; /* protect against bad input */ | |
126 | } | |
127 | if(c>=0) { | |
128 | /* single code point */ | |
129 | const UCaseProps *csp=ucase_getSingleton(); | |
130 | const UChar *resultString; | |
131 | return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0); | |
132 | } else { | |
133 | /* guess some large but stack-friendly capacity */ | |
134 | UChar dest[2*UCASE_MAX_STRING_LENGTH]; | |
135 | int32_t destLength; | |
b331163b | 136 | destLength=u_strFoldCase(dest, UPRV_LENGTHOF(dest), |
729e4ab9 A |
137 | nfd.getBuffer(), nfd.length(), |
138 | U_FOLD_CASE_DEFAULT, &errorCode); | |
139 | return (UBool)(U_SUCCESS(errorCode) && | |
140 | 0!=u_strCompare(nfd.getBuffer(), nfd.length(), | |
141 | dest, destLength, FALSE)); | |
142 | } | |
143 | } | |
144 | #endif | |
145 | ||
146 | #if UCONFIG_NO_NORMALIZATION | |
147 | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &, UChar32, UProperty) { | |
148 | return FALSE; | |
149 | } | |
150 | #else | |
151 | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
152 | UErrorCode errorCode=U_ZERO_ERROR; | |
153 | const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode); | |
154 | if(U_FAILURE(errorCode)) { | |
155 | return FALSE; | |
156 | } | |
157 | UnicodeString src(c); | |
158 | UnicodeString dest; | |
159 | { | |
160 | // The ReorderingBuffer must be in a block because its destructor | |
161 | // needs to release dest's buffer before we look at its contents. | |
162 | ReorderingBuffer buffer(*kcf, dest); | |
163 | // Small destCapacity for NFKC_CF(c). | |
164 | if(buffer.init(5, errorCode)) { | |
165 | const UChar *srcArray=src.getBuffer(); | |
166 | kcf->compose(srcArray, srcArray+src.length(), FALSE, | |
167 | TRUE, buffer, errorCode); | |
168 | } | |
169 | } | |
170 | return U_SUCCESS(errorCode) && dest!=src; | |
171 | } | |
172 | #endif | |
173 | ||
174 | #if UCONFIG_NO_NORMALIZATION | |
175 | static UBool isCanonSegmentStarter(const BinaryProperty &, UChar32, UProperty) { | |
176 | return FALSE; | |
177 | } | |
178 | #else | |
179 | static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
180 | UErrorCode errorCode=U_ZERO_ERROR; | |
181 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); | |
182 | return | |
183 | U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) && | |
184 | impl->isCanonSegmentStarter(c); | |
185 | } | |
186 | #endif | |
187 | ||
188 | static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
189 | return u_isalnumPOSIX(c); | |
190 | } | |
191 | ||
192 | static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
193 | return u_isblank(c); | |
194 | } | |
195 | ||
196 | static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
197 | return u_isgraphPOSIX(c); | |
198 | } | |
199 | ||
200 | static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
201 | return u_isprintPOSIX(c); | |
202 | } | |
203 | ||
204 | static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
205 | return u_isxdigit(c); | |
206 | } | |
207 | ||
208 | static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={ | |
209 | /* | |
210 | * column and mask values for binary properties from u_getUnicodeProperties(). | |
211 | * Must be in order of corresponding UProperty, | |
212 | * and there must be exactly one entry per binary UProperty. | |
213 | * | |
4388f060 | 214 | * Properties with mask==0 are handled in code. |
729e4ab9 A |
215 | * For them, column is the UPropertySource value. |
216 | */ | |
217 | { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains }, | |
218 | { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains }, | |
219 | { UPROPS_SRC_BIDI, 0, isBidiControl }, | |
220 | { UPROPS_SRC_BIDI, 0, isMirrored }, | |
221 | { 1, U_MASK(UPROPS_DASH), defaultContains }, | |
222 | { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains }, | |
223 | { 1, U_MASK(UPROPS_DEPRECATED), defaultContains }, | |
224 | { 1, U_MASK(UPROPS_DIACRITIC), defaultContains }, | |
225 | { 1, U_MASK(UPROPS_EXTENDER), defaultContains }, | |
226 | { UPROPS_SRC_NFC, 0, hasFullCompositionExclusion }, | |
227 | { 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains }, | |
228 | { 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains }, | |
229 | { 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains }, | |
230 | { 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains }, | |
231 | { 1, U_MASK(UPROPS_HYPHEN), defaultContains }, | |
232 | { 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains }, | |
233 | { 1, U_MASK(UPROPS_ID_START), defaultContains }, | |
234 | { 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains }, | |
235 | { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains }, | |
236 | { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains }, | |
237 | { UPROPS_SRC_BIDI, 0, isJoinControl }, | |
238 | { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains }, | |
239 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_LOWERCASE | |
240 | { 1, U_MASK(UPROPS_MATH), defaultContains }, | |
241 | { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains }, | |
242 | { 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains }, | |
243 | { 1, U_MASK(UPROPS_RADICAL), defaultContains }, | |
244 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_SOFT_DOTTED | |
245 | { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains }, | |
246 | { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains }, | |
247 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_UPPERCASE | |
248 | { 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains }, | |
249 | { 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains }, | |
250 | { 1, U_MASK(UPROPS_XID_START), defaultContains }, | |
251 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_SENSITIVE | |
252 | { 1, U_MASK(UPROPS_S_TERM), defaultContains }, | |
253 | { 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains }, | |
254 | { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFD_INERT | |
255 | { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKD_INERT | |
256 | { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFC_INERT | |
257 | { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKC_INERT | |
258 | { UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter }, | |
259 | { 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains }, | |
260 | { 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains }, | |
261 | { UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum }, | |
262 | { UPROPS_SRC_CHAR, 0, isPOSIX_blank }, | |
263 | { UPROPS_SRC_CHAR, 0, isPOSIX_graph }, | |
264 | { UPROPS_SRC_CHAR, 0, isPOSIX_print }, | |
265 | { UPROPS_SRC_CHAR, 0, isPOSIX_xdigit }, | |
266 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASED | |
267 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_IGNORABLE | |
268 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_LOWERCASED | |
269 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_UPPERCASED | |
270 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_TITLECASED | |
271 | { UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded }, | |
272 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_CASEMAPPED | |
2ca993e8 A |
273 | { UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded }, |
274 | { 2, U_MASK(UPROPS_2_EMOJI), defaultContains }, | |
275 | { 2, U_MASK(UPROPS_2_EMOJI_PRESENTATION), defaultContains }, | |
276 | { 2, U_MASK(UPROPS_2_EMOJI_MODIFIER), defaultContains }, | |
277 | { 2, U_MASK(UPROPS_2_EMOJI_MODIFIER_BASE), defaultContains }, | |
729e4ab9 A |
278 | }; |
279 | ||
280 | U_CAPI UBool U_EXPORT2 | |
281 | u_hasBinaryProperty(UChar32 c, UProperty which) { | |
282 | /* c is range-checked in the functions that are called from here */ | |
283 | if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) { | |
284 | /* not a known binary property */ | |
285 | return FALSE; | |
286 | } else { | |
287 | const BinaryProperty &prop=binProps[which]; | |
288 | return prop.contains(prop, c, which); | |
289 | } | |
290 | } | |
291 | ||
729e4ab9 A |
292 | struct IntProperty; |
293 | ||
294 | typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which); | |
295 | typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which); | |
296 | ||
297 | struct IntProperty { | |
298 | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 | |
299 | uint32_t mask; | |
300 | int32_t shift; // =maxValue if getMaxValueFromShift() is used | |
301 | IntPropertyGetValue *getValue; | |
302 | IntPropertyGetMaxValue *getMaxValue; | |
303 | }; | |
304 | ||
305 | static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) { | |
306 | /* systematic, directly stored properties */ | |
307 | return (int32_t)(u_getUnicodeProperties(c, prop.column)&prop.mask)>>prop.shift; | |
308 | } | |
309 | ||
310 | static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) { | |
311 | return (uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift; | |
312 | } | |
313 | ||
314 | static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) { | |
315 | return prop.shift; | |
316 | } | |
317 | ||
318 | static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
319 | return (int32_t)u_charDirection(c); | |
320 | } | |
321 | ||
57a6839d A |
322 | static int32_t getBiDiPairedBracketType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
323 | return (int32_t)ubidi_getPairedBracketType(GET_BIDI_PROPS(), c); | |
324 | } | |
325 | ||
729e4ab9 A |
326 | static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) { |
327 | return ubidi_getMaxValue(GET_BIDI_PROPS(), which); | |
328 | } | |
329 | ||
330 | #if UCONFIG_NO_NORMALIZATION | |
331 | static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) { | |
332 | return 0; | |
333 | } | |
334 | #else | |
335 | static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
336 | return u_getCombiningClass(c); | |
337 | } | |
338 | #endif | |
339 | ||
340 | static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
341 | return (int32_t)u_charType(c); | |
342 | } | |
343 | ||
344 | static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
345 | return ubidi_getJoiningGroup(GET_BIDI_PROPS(), c); | |
346 | } | |
347 | ||
348 | static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
349 | return ubidi_getJoiningType(GET_BIDI_PROPS(), c); | |
350 | } | |
351 | ||
352 | static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
4388f060 | 353 | int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c)); |
729e4ab9 A |
354 | return UPROPS_NTV_GET_TYPE(ntv); |
355 | } | |
356 | ||
357 | static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
358 | UErrorCode errorCode=U_ZERO_ERROR; | |
359 | return (int32_t)uscript_getScript(c, &errorCode); | |
360 | } | |
361 | ||
362 | /* | |
363 | * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. | |
364 | * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. | |
365 | */ | |
366 | static const UHangulSyllableType gcbToHst[]={ | |
367 | U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */ | |
368 | U_HST_NOT_APPLICABLE, /* U_GCB_CONTROL */ | |
369 | U_HST_NOT_APPLICABLE, /* U_GCB_CR */ | |
370 | U_HST_NOT_APPLICABLE, /* U_GCB_EXTEND */ | |
371 | U_HST_LEADING_JAMO, /* U_GCB_L */ | |
372 | U_HST_NOT_APPLICABLE, /* U_GCB_LF */ | |
373 | U_HST_LV_SYLLABLE, /* U_GCB_LV */ | |
374 | U_HST_LVT_SYLLABLE, /* U_GCB_LVT */ | |
375 | U_HST_TRAILING_JAMO, /* U_GCB_T */ | |
376 | U_HST_VOWEL_JAMO /* U_GCB_V */ | |
377 | /* | |
378 | * Omit GCB values beyond what we need for hst. | |
379 | * The code below checks for the array length. | |
380 | */ | |
381 | }; | |
382 | ||
383 | static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
384 | /* see comments on gcbToHst[] above */ | |
385 | int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT; | |
b331163b | 386 | if(gcb<UPRV_LENGTHOF(gcbToHst)) { |
729e4ab9 A |
387 | return gcbToHst[gcb]; |
388 | } else { | |
389 | return U_HST_NOT_APPLICABLE; | |
390 | } | |
391 | } | |
392 | ||
393 | #if UCONFIG_NO_NORMALIZATION | |
394 | static int32_t getNormQuickCheck(const IntProperty &, UChar32, UProperty) { | |
395 | return 0; | |
396 | } | |
397 | #else | |
398 | static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) { | |
399 | return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD)); | |
400 | } | |
401 | #endif | |
402 | ||
403 | #if UCONFIG_NO_NORMALIZATION | |
404 | static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) { | |
405 | return 0; | |
406 | } | |
407 | #else | |
408 | static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
4388f060 | 409 | return unorm_getFCD16(c)>>8; |
729e4ab9 A |
410 | } |
411 | #endif | |
412 | ||
413 | #if UCONFIG_NO_NORMALIZATION | |
414 | static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) { | |
415 | return 0; | |
416 | } | |
417 | #else | |
418 | static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
4388f060 | 419 | return unorm_getFCD16(c)&0xff; |
729e4ab9 A |
420 | } |
421 | #endif | |
422 | ||
423 | static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={ | |
424 | /* | |
425 | * column, mask and shift values for int-value properties from u_getUnicodeProperties(). | |
426 | * Must be in order of corresponding UProperty, | |
427 | * and there must be exactly one entry per int UProperty. | |
428 | * | |
4388f060 | 429 | * Properties with mask==0 are handled in code. |
729e4ab9 A |
430 | * For them, column is the UPropertySource value. |
431 | */ | |
432 | { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue }, | |
433 | { 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
434 | { UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift }, | |
435 | { 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue }, | |
436 | { 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
437 | { UPROPS_SRC_CHAR, 0, (int32_t)U_CHAR_CATEGORY_COUNT-1,getGeneralCategory, getMaxValueFromShift }, | |
438 | { UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue }, | |
439 | { UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue }, | |
440 | { 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
441 | { UPROPS_SRC_CHAR, 0, (int32_t)U_NT_COUNT-1, getNumericType, getMaxValueFromShift }, | |
442 | { 0, UPROPS_SCRIPT_MASK, 0, getScript, defaultGetMaxValue }, | |
443 | { UPROPS_SRC_PROPSVEC, 0, (int32_t)U_HST_COUNT-1, getHangulSyllableType, getMaxValueFromShift }, | |
444 | // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" | |
445 | { UPROPS_SRC_NFC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift }, | |
446 | // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" | |
447 | { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift }, | |
448 | // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE | |
449 | { UPROPS_SRC_NFC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift }, | |
450 | // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE | |
451 | { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift }, | |
452 | { UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift }, | |
453 | { UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift }, | |
454 | { 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
455 | { 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
57a6839d A |
456 | { 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue }, |
457 | { UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType, biDiGetMaxValue }, | |
729e4ab9 A |
458 | }; |
459 | ||
460 | U_CAPI int32_t U_EXPORT2 | |
461 | u_getIntPropertyValue(UChar32 c, UProperty which) { | |
462 | if(which<UCHAR_INT_START) { | |
463 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { | |
464 | const BinaryProperty &prop=binProps[which]; | |
465 | return prop.contains(prop, c, which); | |
466 | } | |
467 | } else if(which<UCHAR_INT_LIMIT) { | |
468 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; | |
469 | return prop.getValue(prop, c, which); | |
470 | } else if(which==UCHAR_GENERAL_CATEGORY_MASK) { | |
471 | return U_MASK(u_charType(c)); | |
472 | } | |
473 | return 0; // undefined | |
474 | } | |
475 | ||
476 | U_CAPI int32_t U_EXPORT2 | |
477 | u_getIntPropertyMinValue(UProperty /*which*/) { | |
478 | return 0; /* all binary/enum/int properties have a minimum value of 0 */ | |
479 | } | |
480 | ||
481 | U_CAPI int32_t U_EXPORT2 | |
482 | u_getIntPropertyMaxValue(UProperty which) { | |
483 | if(which<UCHAR_INT_START) { | |
484 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { | |
485 | return 1; // maximum TRUE for all binary properties | |
486 | } | |
487 | } else if(which<UCHAR_INT_LIMIT) { | |
488 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; | |
489 | return prop.getMaxValue(prop, which); | |
490 | } | |
491 | return -1; // undefined | |
492 | } | |
493 | ||
494 | U_CFUNC UPropertySource U_EXPORT2 | |
495 | uprops_getSource(UProperty which) { | |
496 | if(which<UCHAR_BINARY_START) { | |
497 | return UPROPS_SRC_NONE; /* undefined */ | |
498 | } else if(which<UCHAR_BINARY_LIMIT) { | |
499 | const BinaryProperty &prop=binProps[which]; | |
500 | if(prop.mask!=0) { | |
501 | return UPROPS_SRC_PROPSVEC; | |
502 | } else { | |
503 | return (UPropertySource)prop.column; | |
504 | } | |
505 | } else if(which<UCHAR_INT_START) { | |
506 | return UPROPS_SRC_NONE; /* undefined */ | |
507 | } else if(which<UCHAR_INT_LIMIT) { | |
508 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; | |
509 | if(prop.mask!=0) { | |
510 | return UPROPS_SRC_PROPSVEC; | |
511 | } else { | |
512 | return (UPropertySource)prop.column; | |
513 | } | |
514 | } else if(which<UCHAR_STRING_START) { | |
515 | switch(which) { | |
516 | case UCHAR_GENERAL_CATEGORY_MASK: | |
517 | case UCHAR_NUMERIC_VALUE: | |
518 | return UPROPS_SRC_CHAR; | |
519 | ||
520 | default: | |
521 | return UPROPS_SRC_NONE; | |
522 | } | |
523 | } else if(which<UCHAR_STRING_LIMIT) { | |
524 | switch(which) { | |
525 | case UCHAR_AGE: | |
526 | return UPROPS_SRC_PROPSVEC; | |
527 | ||
528 | case UCHAR_BIDI_MIRRORING_GLYPH: | |
529 | return UPROPS_SRC_BIDI; | |
530 | ||
531 | case UCHAR_CASE_FOLDING: | |
532 | case UCHAR_LOWERCASE_MAPPING: | |
533 | case UCHAR_SIMPLE_CASE_FOLDING: | |
534 | case UCHAR_SIMPLE_LOWERCASE_MAPPING: | |
535 | case UCHAR_SIMPLE_TITLECASE_MAPPING: | |
536 | case UCHAR_SIMPLE_UPPERCASE_MAPPING: | |
537 | case UCHAR_TITLECASE_MAPPING: | |
538 | case UCHAR_UPPERCASE_MAPPING: | |
539 | return UPROPS_SRC_CASE; | |
540 | ||
541 | case UCHAR_ISO_COMMENT: | |
542 | case UCHAR_NAME: | |
543 | case UCHAR_UNICODE_1_NAME: | |
544 | return UPROPS_SRC_NAMES; | |
545 | ||
546 | default: | |
547 | return UPROPS_SRC_NONE; | |
548 | } | |
549 | } else { | |
550 | switch(which) { | |
551 | case UCHAR_SCRIPT_EXTENSIONS: | |
552 | return UPROPS_SRC_PROPSVEC; | |
553 | default: | |
554 | return UPROPS_SRC_NONE; /* undefined */ | |
555 | } | |
556 | } | |
557 | } | |
558 | ||
559 | #if !UCONFIG_NO_NORMALIZATION | |
560 | ||
561 | U_CAPI int32_t U_EXPORT2 | |
562 | u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) { | |
563 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
564 | return 0; | |
565 | } | |
566 | if(destCapacity<0 || (dest==NULL && destCapacity>0)) { | |
567 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
568 | return 0; | |
569 | } | |
570 | // Compute the FC_NFKC_Closure on the fly: | |
571 | // We have the API for complete coverage of Unicode properties, although | |
572 | // this value by itself is not useful via API. | |
573 | // (What could be useful is a custom normalization table that combines | |
574 | // case folding and NFKC.) | |
575 | // For the derivation, see Unicode's DerivedNormalizationProps.txt. | |
b331163b | 576 | const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode); |
729e4ab9 A |
577 | const UCaseProps *csp=ucase_getSingleton(); |
578 | if(U_FAILURE(*pErrorCode)) { | |
579 | return 0; | |
580 | } | |
581 | // first: b = NFKC(Fold(a)) | |
582 | UnicodeString folded1String; | |
583 | const UChar *folded1; | |
584 | int32_t folded1Length=ucase_toFullFolding(csp, c, &folded1, U_FOLD_CASE_DEFAULT); | |
585 | if(folded1Length<0) { | |
586 | const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc); | |
587 | if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) { | |
588 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); // c does not change at all under CaseFolding+NFKC | |
589 | } | |
590 | folded1String.setTo(c); | |
591 | } else { | |
592 | if(folded1Length>UCASE_MAX_STRING_LENGTH) { | |
593 | folded1String.setTo(folded1Length); | |
594 | } else { | |
595 | folded1String.setTo(FALSE, folded1, folded1Length); | |
596 | } | |
597 | } | |
598 | UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode); | |
599 | // second: c = NFKC(Fold(b)) | |
600 | UnicodeString folded2String(kc1); | |
601 | UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode); | |
602 | // if (c != b) add the mapping from a to c | |
603 | if(U_FAILURE(*pErrorCode) || kc1==kc2) { | |
604 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); | |
605 | } else { | |
606 | return kc2.extract(dest, destCapacity, *pErrorCode); | |
607 | } | |
608 | } | |
609 | ||
610 | #endif |