Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
729e4ab9 A |
3 | /* |
4 | ******************************************************************************* | |
5 | * | |
2ca993e8 | 6 | * Copyright (C) 2002-2016, International Business Machines |
729e4ab9 A |
7 | * Corporation and others. All Rights Reserved. |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: uprops.cpp | |
f3c0d7a5 | 11 | * encoding: UTF-8 |
729e4ab9 A |
12 | * tab size: 8 (not used) |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2002feb24 | |
16 | * created by: Markus W. Scherer | |
17 | * | |
18 | * Implementations for mostly non-core Unicode character properties | |
19 | * stored in uprops.icu. | |
20 | * | |
21 | * With the APIs implemented here, almost all properties files and | |
22 | * their associated implementation files are used from this file, | |
23 | * including those for normalization and case mappings. | |
24 | */ | |
25 | ||
26 | #include "unicode/utypes.h" | |
27 | #include "unicode/uchar.h" | |
3d1f044b A |
28 | #include "unicode/ucptrie.h" |
29 | #include "unicode/udata.h" | |
729e4ab9 A |
30 | #include "unicode/unorm2.h" |
31 | #include "unicode/uscript.h" | |
32 | #include "unicode/ustring.h" | |
33 | #include "cstring.h" | |
3d1f044b | 34 | #include "mutex.h" |
729e4ab9 | 35 | #include "normalizer2impl.h" |
729e4ab9 A |
36 | #include "umutex.h" |
37 | #include "ubidi_props.h" | |
38 | #include "uprops.h" | |
39 | #include "ucase.h" | |
3d1f044b A |
40 | #include "ucln_cmn.h" |
41 | #include "ulayout_props.h" | |
729e4ab9 A |
42 | #include "ustr_imp.h" |
43 | ||
729e4ab9 A |
44 | U_NAMESPACE_USE |
45 | ||
3d1f044b A |
46 | // Unicode text layout properties data ----------------------------------------- |
47 | ||
48 | namespace { | |
49 | ||
50 | icu::UInitOnce gLayoutInitOnce = U_INITONCE_INITIALIZER; | |
51 | UDataMemory *gLayoutMemory = nullptr; | |
52 | ||
53 | UCPTrie *gInpcTrie = nullptr; // Indic_Positional_Category | |
54 | UCPTrie *gInscTrie = nullptr; // Indic_Syllabic_Category | |
55 | UCPTrie *gVoTrie = nullptr; // Vertical_Orientation | |
56 | ||
57 | int32_t gMaxInpcValue = 0; | |
58 | int32_t gMaxInscValue = 0; | |
59 | int32_t gMaxVoValue = 0; | |
60 | ||
61 | UBool U_CALLCONV uprops_cleanup() { | |
62 | udata_close(gLayoutMemory); | |
63 | gLayoutMemory = nullptr; | |
64 | ||
65 | ucptrie_close(gInpcTrie); | |
66 | gInpcTrie = nullptr; | |
67 | ucptrie_close(gInscTrie); | |
68 | gInscTrie = nullptr; | |
69 | ucptrie_close(gVoTrie); | |
70 | gVoTrie = nullptr; | |
71 | ||
72 | gMaxInpcValue = 0; | |
73 | gMaxInscValue = 0; | |
74 | gMaxVoValue = 0; | |
75 | ||
76 | gLayoutInitOnce.reset(); | |
77 | return TRUE; | |
78 | } | |
79 | ||
80 | UBool U_CALLCONV | |
81 | ulayout_isAcceptable(void * /*context*/, | |
82 | const char * /* type */, const char * /*name*/, | |
83 | const UDataInfo *pInfo) { | |
84 | return pInfo->size >= 20 && | |
85 | pInfo->isBigEndian == U_IS_BIG_ENDIAN && | |
86 | pInfo->charsetFamily == U_CHARSET_FAMILY && | |
87 | pInfo->dataFormat[0] == ULAYOUT_FMT_0 && | |
88 | pInfo->dataFormat[1] == ULAYOUT_FMT_1 && | |
89 | pInfo->dataFormat[2] == ULAYOUT_FMT_2 && | |
90 | pInfo->dataFormat[3] == ULAYOUT_FMT_3 && | |
91 | pInfo->formatVersion[0] == 1; | |
92 | } | |
93 | ||
94 | // UInitOnce singleton initialization function | |
95 | void U_CALLCONV ulayout_load(UErrorCode &errorCode) { | |
96 | gLayoutMemory = udata_openChoice( | |
97 | nullptr, ULAYOUT_DATA_TYPE, ULAYOUT_DATA_NAME, | |
98 | ulayout_isAcceptable, nullptr, &errorCode); | |
99 | if (U_FAILURE(errorCode)) { return; } | |
100 | ||
101 | const uint8_t *inBytes = (const uint8_t *)udata_getMemory(gLayoutMemory); | |
102 | const int32_t *inIndexes = (const int32_t *)inBytes; | |
103 | int32_t indexesLength = inIndexes[ULAYOUT_IX_INDEXES_LENGTH]; | |
104 | if (indexesLength < 12) { | |
105 | errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes. | |
106 | return; | |
107 | } | |
108 | int32_t offset = indexesLength * 4; | |
109 | int32_t top = inIndexes[ULAYOUT_IX_INPC_TRIE_TOP]; | |
110 | int32_t trieSize = top - offset; | |
111 | if (trieSize >= 16) { | |
112 | gInpcTrie = ucptrie_openFromBinary( | |
113 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, | |
114 | inBytes + offset, trieSize, nullptr, &errorCode); | |
115 | } | |
116 | offset = top; | |
117 | top = inIndexes[ULAYOUT_IX_INSC_TRIE_TOP]; | |
118 | trieSize = top - offset; | |
119 | if (trieSize >= 16) { | |
120 | gInscTrie = ucptrie_openFromBinary( | |
121 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, | |
122 | inBytes + offset, trieSize, nullptr, &errorCode); | |
123 | } | |
124 | offset = top; | |
125 | top = inIndexes[ULAYOUT_IX_VO_TRIE_TOP]; | |
126 | trieSize = top - offset; | |
127 | if (trieSize >= 16) { | |
128 | gVoTrie = ucptrie_openFromBinary( | |
129 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, | |
130 | inBytes + offset, trieSize, nullptr, &errorCode); | |
131 | } | |
132 | ||
133 | uint32_t maxValues = inIndexes[ULAYOUT_IX_MAX_VALUES]; | |
134 | gMaxInpcValue = maxValues >> ULAYOUT_MAX_INPC_SHIFT; | |
135 | gMaxInscValue = (maxValues >> ULAYOUT_MAX_INSC_SHIFT) & 0xff; | |
136 | gMaxVoValue = (maxValues >> ULAYOUT_MAX_VO_SHIFT) & 0xff; | |
137 | ||
138 | ucln_common_registerCleanup(UCLN_COMMON_UPROPS, uprops_cleanup); | |
139 | } | |
140 | ||
141 | UBool ulayout_ensureData(UErrorCode &errorCode) { | |
142 | if (U_FAILURE(errorCode)) { return FALSE; } | |
143 | umtx_initOnce(gLayoutInitOnce, &ulayout_load, errorCode); | |
144 | return U_SUCCESS(errorCode); | |
145 | } | |
146 | ||
147 | UBool ulayout_ensureData() { | |
148 | UErrorCode errorCode = U_ZERO_ERROR; | |
149 | return ulayout_ensureData(errorCode); | |
150 | } | |
151 | ||
152 | } // namespace | |
153 | ||
729e4ab9 A |
154 | /* general properties API functions ----------------------------------------- */ |
155 | ||
156 | struct BinaryProperty; | |
157 | ||
158 | typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which); | |
159 | ||
160 | struct BinaryProperty { | |
161 | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 | |
162 | uint32_t mask; | |
163 | BinaryPropertyContains *contains; | |
164 | }; | |
165 | ||
166 | static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { | |
167 | /* systematic, directly stored properties */ | |
168 | return (u_getUnicodeProperties(c, prop.column)&prop.mask)!=0; | |
169 | } | |
170 | ||
171 | static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { | |
3d1f044b | 172 | return static_cast<UBool>(ucase_hasBinaryProperty(c, which)); |
729e4ab9 A |
173 | } |
174 | ||
175 | static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
0f5d89e8 | 176 | return ubidi_isBidiControl(c); |
729e4ab9 A |
177 | } |
178 | ||
179 | static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
0f5d89e8 | 180 | return ubidi_isMirrored(c); |
729e4ab9 A |
181 | } |
182 | ||
183 | static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
0f5d89e8 | 184 | return ubidi_isJoinControl(c); |
729e4ab9 A |
185 | } |
186 | ||
187 | #if UCONFIG_NO_NORMALIZATION | |
188 | static UBool hasFullCompositionExclusion(const BinaryProperty &, UChar32, UProperty) { | |
189 | return FALSE; | |
190 | } | |
191 | #else | |
192 | static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
193 | // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. | |
194 | UErrorCode errorCode=U_ZERO_ERROR; | |
195 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); | |
196 | return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c)); | |
197 | } | |
198 | #endif | |
199 | ||
200 | // UCHAR_NF*_INERT properties | |
201 | #if UCONFIG_NO_NORMALIZATION | |
202 | static UBool isNormInert(const BinaryProperty &, UChar32, UProperty) { | |
203 | return FALSE; | |
204 | } | |
205 | #else | |
206 | static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { | |
207 | UErrorCode errorCode=U_ZERO_ERROR; | |
208 | const Normalizer2 *norm2=Normalizer2Factory::getInstance( | |
209 | (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode); | |
210 | return U_SUCCESS(errorCode) && norm2->isInert(c); | |
211 | } | |
212 | #endif | |
213 | ||
214 | #if UCONFIG_NO_NORMALIZATION | |
215 | static UBool changesWhenCasefolded(const BinaryProperty &, UChar32, UProperty) { | |
216 | return FALSE; | |
217 | } | |
218 | #else | |
219 | static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
220 | UnicodeString nfd; | |
221 | UErrorCode errorCode=U_ZERO_ERROR; | |
b331163b | 222 | const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode); |
729e4ab9 A |
223 | if(U_FAILURE(errorCode)) { |
224 | return FALSE; | |
225 | } | |
226 | if(nfcNorm2->getDecomposition(c, nfd)) { | |
227 | /* c has a decomposition */ | |
228 | if(nfd.length()==1) { | |
229 | c=nfd[0]; /* single BMP code point */ | |
230 | } else if(nfd.length()<=U16_MAX_LENGTH && | |
231 | nfd.length()==U16_LENGTH(c=nfd.char32At(0)) | |
232 | ) { | |
233 | /* single supplementary code point */ | |
234 | } else { | |
235 | c=U_SENTINEL; | |
236 | } | |
237 | } else if(c<0) { | |
238 | return FALSE; /* protect against bad input */ | |
239 | } | |
240 | if(c>=0) { | |
241 | /* single code point */ | |
729e4ab9 | 242 | const UChar *resultString; |
f3c0d7a5 | 243 | return (UBool)(ucase_toFullFolding(c, &resultString, U_FOLD_CASE_DEFAULT)>=0); |
729e4ab9 A |
244 | } else { |
245 | /* guess some large but stack-friendly capacity */ | |
246 | UChar dest[2*UCASE_MAX_STRING_LENGTH]; | |
247 | int32_t destLength; | |
b331163b | 248 | destLength=u_strFoldCase(dest, UPRV_LENGTHOF(dest), |
729e4ab9 A |
249 | nfd.getBuffer(), nfd.length(), |
250 | U_FOLD_CASE_DEFAULT, &errorCode); | |
251 | return (UBool)(U_SUCCESS(errorCode) && | |
252 | 0!=u_strCompare(nfd.getBuffer(), nfd.length(), | |
253 | dest, destLength, FALSE)); | |
254 | } | |
255 | } | |
256 | #endif | |
257 | ||
258 | #if UCONFIG_NO_NORMALIZATION | |
259 | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &, UChar32, UProperty) { | |
260 | return FALSE; | |
261 | } | |
262 | #else | |
263 | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
264 | UErrorCode errorCode=U_ZERO_ERROR; | |
265 | const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode); | |
266 | if(U_FAILURE(errorCode)) { | |
267 | return FALSE; | |
268 | } | |
269 | UnicodeString src(c); | |
270 | UnicodeString dest; | |
271 | { | |
272 | // The ReorderingBuffer must be in a block because its destructor | |
273 | // needs to release dest's buffer before we look at its contents. | |
274 | ReorderingBuffer buffer(*kcf, dest); | |
275 | // Small destCapacity for NFKC_CF(c). | |
276 | if(buffer.init(5, errorCode)) { | |
277 | const UChar *srcArray=src.getBuffer(); | |
278 | kcf->compose(srcArray, srcArray+src.length(), FALSE, | |
279 | TRUE, buffer, errorCode); | |
280 | } | |
281 | } | |
282 | return U_SUCCESS(errorCode) && dest!=src; | |
283 | } | |
284 | #endif | |
285 | ||
286 | #if UCONFIG_NO_NORMALIZATION | |
287 | static UBool isCanonSegmentStarter(const BinaryProperty &, UChar32, UProperty) { | |
288 | return FALSE; | |
289 | } | |
290 | #else | |
291 | static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
292 | UErrorCode errorCode=U_ZERO_ERROR; | |
293 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); | |
294 | return | |
295 | U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) && | |
296 | impl->isCanonSegmentStarter(c); | |
297 | } | |
298 | #endif | |
299 | ||
300 | static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
301 | return u_isalnumPOSIX(c); | |
302 | } | |
303 | ||
304 | static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
305 | return u_isblank(c); | |
306 | } | |
307 | ||
308 | static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
309 | return u_isgraphPOSIX(c); | |
310 | } | |
311 | ||
312 | static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
313 | return u_isprintPOSIX(c); | |
314 | } | |
315 | ||
316 | static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
317 | return u_isxdigit(c); | |
318 | } | |
319 | ||
6be67b06 A |
320 | static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
321 | // Property starts are a subset of lb=RI etc. | |
322 | return 0x1F1E6<=c && c<=0x1F1FF; | |
323 | } | |
324 | ||
729e4ab9 A |
// Dispatch table for binary properties, indexed directly by the UProperty value
// (see binProps[which] in u_hasBinaryProperty()). Rows with a nonzero mask use
// defaultContains(), which tests the mask against the properties word of the
// given column. Rows with mask==0 name a dedicated function, and their column
// holds the source value that uprops_getSource() returns for the property.
// Entry order is significant; do not reorder.
325 | static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={ |
326 | /* | |
327 | * column and mask values for binary properties from u_getUnicodeProperties(). | |
328 | * Must be in order of corresponding UProperty, | |
329 | * and there must be exactly one entry per binary UProperty. | |
330 | * | |
4388f060 | 331 | * Properties with mask==0 are handled in code. |
729e4ab9 A |
332 | * For them, column is the UPropertySource value. |
333 | */ | |
334 | { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains }, | |
335 | { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains }, | |
336 | { UPROPS_SRC_BIDI, 0, isBidiControl }, | |
337 | { UPROPS_SRC_BIDI, 0, isMirrored }, | |
338 | { 1, U_MASK(UPROPS_DASH), defaultContains }, | |
339 | { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains }, | |
340 | { 1, U_MASK(UPROPS_DEPRECATED), defaultContains }, | |
341 | { 1, U_MASK(UPROPS_DIACRITIC), defaultContains }, | |
342 | { 1, U_MASK(UPROPS_EXTENDER), defaultContains }, | |
343 | { UPROPS_SRC_NFC, 0, hasFullCompositionExclusion }, | |
344 | { 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains }, | |
345 | { 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains }, | |
346 | { 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains }, | |
347 | { 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains }, | |
348 | { 1, U_MASK(UPROPS_HYPHEN), defaultContains }, | |
349 | { 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains }, | |
350 | { 1, U_MASK(UPROPS_ID_START), defaultContains }, | |
351 | { 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains }, | |
352 | { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains }, | |
353 | { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains }, | |
354 | { UPROPS_SRC_BIDI, 0, isJoinControl }, | |
355 | { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains }, | |
356 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_LOWERCASE | |
357 | { 1, U_MASK(UPROPS_MATH), defaultContains }, | |
358 | { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains }, | |
359 | { 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains }, | |
360 | { 1, U_MASK(UPROPS_RADICAL), defaultContains }, | |
361 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_SOFT_DOTTED | |
362 | { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains }, | |
363 | { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains }, | |
364 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_UPPERCASE | |
365 | { 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains }, | |
366 | { 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains }, | |
367 | { 1, U_MASK(UPROPS_XID_START), defaultContains }, | |
368 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_SENSITIVE | |
369 | { 1, U_MASK(UPROPS_S_TERM), defaultContains }, | |
370 | { 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains }, | |
371 | { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFD_INERT | |
372 | { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKD_INERT | |
373 | { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFC_INERT | |
374 | { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKC_INERT | |
375 | { UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter }, | |
376 | { 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains }, | |
377 | { 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains }, | |
378 | { UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum }, | |
379 | { UPROPS_SRC_CHAR, 0, isPOSIX_blank }, | |
380 | { UPROPS_SRC_CHAR, 0, isPOSIX_graph }, | |
381 | { UPROPS_SRC_CHAR, 0, isPOSIX_print }, | |
382 | { UPROPS_SRC_CHAR, 0, isPOSIX_xdigit }, | |
383 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASED | |
384 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_IGNORABLE | |
385 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_LOWERCASED | |
386 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_UPPERCASED | |
387 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_TITLECASED | |
388 | { UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded }, | |
389 | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_CASEMAPPED | |
2ca993e8 A |
390 | { UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded }, |
391 | { 2, U_MASK(UPROPS_2_EMOJI), defaultContains }, | |
392 | { 2, U_MASK(UPROPS_2_EMOJI_PRESENTATION), defaultContains }, | |
393 | { 2, U_MASK(UPROPS_2_EMOJI_MODIFIER), defaultContains }, | |
394 | { 2, U_MASK(UPROPS_2_EMOJI_MODIFIER_BASE), defaultContains }, | |
6be67b06 A |
395 | { 2, U_MASK(UPROPS_2_EMOJI_COMPONENT), defaultContains }, |
396 | { 2, 0, isRegionalIndicator }, | |
397 | { 1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains }, | |
0f5d89e8 | 398 | { 2, U_MASK(UPROPS_2_EXTENDED_PICTOGRAPHIC), defaultContains }, |
729e4ab9 A |
399 | }; |
400 | ||
401 | U_CAPI UBool U_EXPORT2 | |
402 | u_hasBinaryProperty(UChar32 c, UProperty which) { | |
403 | /* c is range-checked in the functions that are called from here */ | |
404 | if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) { | |
405 | /* not a known binary property */ | |
406 | return FALSE; | |
407 | } else { | |
408 | const BinaryProperty &prop=binProps[which]; | |
409 | return prop.contains(prop, c, which); | |
410 | } | |
411 | } | |
412 | ||
0f5d89e8 A |
413 | // Apple-only specific version of the above |
414 | U_CAPI UBool U_EXPORT2 | |
415 | u_isEmoji(UChar32 c) { | |
416 | const BinaryProperty &prop=binProps[UCHAR_EMOJI]; | |
417 | return prop.contains(prop, c, UCHAR_EMOJI); | |
418 | } | |
419 | ||
729e4ab9 A |
420 | struct IntProperty; |
421 | ||
422 | typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which); | |
423 | typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which); | |
424 | ||
425 | struct IntProperty { | |
426 | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 | |
427 | uint32_t mask; | |
428 | int32_t shift; // =maxValue if getMaxValueFromShift() is used | |
429 | IntPropertyGetValue *getValue; | |
430 | IntPropertyGetMaxValue *getMaxValue; | |
431 | }; | |
432 | ||
433 | static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) { | |
434 | /* systematic, directly stored properties */ | |
435 | return (int32_t)(u_getUnicodeProperties(c, prop.column)&prop.mask)>>prop.shift; | |
436 | } | |
437 | ||
438 | static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) { | |
439 | return (uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift; | |
440 | } | |
441 | ||
442 | static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) { | |
443 | return prop.shift; | |
444 | } | |
445 | ||
446 | static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
447 | return (int32_t)u_charDirection(c); | |
448 | } | |
449 | ||
57a6839d | 450 | static int32_t getBiDiPairedBracketType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
0f5d89e8 | 451 | return (int32_t)ubidi_getPairedBracketType(c); |
57a6839d A |
452 | } |
453 | ||
729e4ab9 | 454 | static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) { |
0f5d89e8 | 455 | return ubidi_getMaxValue(which); |
729e4ab9 A |
456 | } |
457 | ||
458 | #if UCONFIG_NO_NORMALIZATION | |
459 | static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) { | |
460 | return 0; | |
461 | } | |
462 | #else | |
463 | static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
464 | return u_getCombiningClass(c); | |
465 | } | |
466 | #endif | |
467 | ||
468 | static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
469 | return (int32_t)u_charType(c); | |
470 | } | |
471 | ||
472 | static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
0f5d89e8 | 473 | return ubidi_getJoiningGroup(c); |
729e4ab9 A |
474 | } |
475 | ||
476 | static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
0f5d89e8 | 477 | return ubidi_getJoiningType(c); |
729e4ab9 A |
478 | } |
479 | ||
480 | static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
4388f060 | 481 | int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c)); |
729e4ab9 A |
482 | return UPROPS_NTV_GET_TYPE(ntv); |
483 | } | |
484 | ||
485 | static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
486 | UErrorCode errorCode=U_ZERO_ERROR; | |
487 | return (int32_t)uscript_getScript(c, &errorCode); | |
488 | } | |
489 | ||
490 | /* | |
491 | * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. | |
492 | * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. | |
493 | */ | |
494 | static const UHangulSyllableType gcbToHst[]={ | |
495 | U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */ | |
496 | U_HST_NOT_APPLICABLE, /* U_GCB_CONTROL */ | |
497 | U_HST_NOT_APPLICABLE, /* U_GCB_CR */ | |
498 | U_HST_NOT_APPLICABLE, /* U_GCB_EXTEND */ | |
499 | U_HST_LEADING_JAMO, /* U_GCB_L */ | |
500 | U_HST_NOT_APPLICABLE, /* U_GCB_LF */ | |
501 | U_HST_LV_SYLLABLE, /* U_GCB_LV */ | |
502 | U_HST_LVT_SYLLABLE, /* U_GCB_LVT */ | |
503 | U_HST_TRAILING_JAMO, /* U_GCB_T */ | |
504 | U_HST_VOWEL_JAMO /* U_GCB_V */ | |
505 | /* | |
506 | * Omit GCB values beyond what we need for hst. | |
507 | * The code below checks for the array length. | |
508 | */ | |
509 | }; | |
510 | ||
511 | static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
512 | /* see comments on gcbToHst[] above */ | |
513 | int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT; | |
b331163b | 514 | if(gcb<UPRV_LENGTHOF(gcbToHst)) { |
729e4ab9 A |
515 | return gcbToHst[gcb]; |
516 | } else { | |
517 | return U_HST_NOT_APPLICABLE; | |
518 | } | |
519 | } | |
520 | ||
521 | #if UCONFIG_NO_NORMALIZATION | |
522 | static int32_t getNormQuickCheck(const IntProperty &, UChar32, UProperty) { | |
523 | return 0; | |
524 | } | |
525 | #else | |
526 | static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) { | |
527 | return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD)); | |
528 | } | |
529 | #endif | |
530 | ||
531 | #if UCONFIG_NO_NORMALIZATION | |
532 | static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) { | |
533 | return 0; | |
534 | } | |
535 | #else | |
536 | static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
4388f060 | 537 | return unorm_getFCD16(c)>>8; |
729e4ab9 A |
538 | } |
539 | #endif | |
540 | ||
541 | #if UCONFIG_NO_NORMALIZATION | |
542 | static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) { | |
543 | return 0; | |
544 | } | |
545 | #else | |
546 | static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { | |
4388f060 | 547 | return unorm_getFCD16(c)&0xff; |
729e4ab9 A |
548 | } |
549 | #endif | |
550 | ||
3d1f044b A |
551 | static int32_t getInPC(const IntProperty &, UChar32 c, UProperty) { |
552 | return ulayout_ensureData() && gInpcTrie != nullptr ? ucptrie_get(gInpcTrie, c) : 0; | |
553 | } | |
554 | ||
555 | static int32_t getInSC(const IntProperty &, UChar32 c, UProperty) { | |
556 | return ulayout_ensureData() && gInscTrie != nullptr ? ucptrie_get(gInscTrie, c) : 0; | |
557 | } | |
558 | ||
559 | static int32_t getVo(const IntProperty &, UChar32 c, UProperty) { | |
560 | return ulayout_ensureData() && gVoTrie != nullptr ? ucptrie_get(gVoTrie, c) : 0; | |
561 | } | |
562 | ||
563 | static int32_t layoutGetMaxValue(const IntProperty &/*prop*/, UProperty which) { | |
564 | if (!ulayout_ensureData()) { return 0; } | |
565 | switch (which) { | |
566 | case UCHAR_INDIC_POSITIONAL_CATEGORY: | |
567 | return gMaxInpcValue; | |
568 | case UCHAR_INDIC_SYLLABIC_CATEGORY: | |
569 | return gMaxInscValue; | |
570 | case UCHAR_VERTICAL_ORIENTATION: | |
571 | return gMaxVoValue; | |
572 | default: | |
573 | return 0; | |
574 | } | |
575 | } | |
576 | ||
729e4ab9 A |
// Dispatch table for integer/enumerated properties, indexed by
// which-UCHAR_INT_START (see u_getIntPropertyValue() and
// u_getIntPropertyMaxValue()). Each row holds column, mask, shift, the value
// getter and the maximum-value getter. For rows that use getMaxValueFromShift,
// the shift field holds the maximum value itself. For rows with mask==0,
// uprops_getSource() returns the column field as the property source.
// Entry order is significant; do not reorder.
577 | static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={ |
578 | /* | |
579 | * column, mask and shift values for int-value properties from u_getUnicodeProperties(). | |
580 | * Must be in order of corresponding UProperty, | |
581 | * and there must be exactly one entry per int UProperty. | |
582 | * | |
4388f060 | 583 | * Properties with mask==0 are handled in code. |
729e4ab9 A |
584 | * For them, column is the UPropertySource value. |
585 | */ | |
586 | { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue }, | |
587 | { 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
588 | { UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift }, | |
589 | { 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue }, | |
590 | { 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
591 | { UPROPS_SRC_CHAR, 0, (int32_t)U_CHAR_CATEGORY_COUNT-1,getGeneralCategory, getMaxValueFromShift }, | |
592 | { UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue }, | |
593 | { UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue }, | |
594 | { 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
595 | { UPROPS_SRC_CHAR, 0, (int32_t)U_NT_COUNT-1, getNumericType, getMaxValueFromShift }, | |
596 | { 0, UPROPS_SCRIPT_MASK, 0, getScript, defaultGetMaxValue }, | |
597 | { UPROPS_SRC_PROPSVEC, 0, (int32_t)U_HST_COUNT-1, getHangulSyllableType, getMaxValueFromShift }, | |
598 | // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" | |
599 | { UPROPS_SRC_NFC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift }, | |
600 | // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" | |
601 | { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift }, | |
602 | // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE | |
603 | { UPROPS_SRC_NFC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift }, | |
604 | // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE | |
605 | { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift }, | |
606 | { UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift }, | |
607 | { UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift }, | |
608 | { 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
609 | { 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue }, | |
57a6839d A |
610 | { 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue }, |
611 | { UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType, biDiGetMaxValue }, | |
3d1f044b A |
612 | { UPROPS_SRC_INPC, 0, 0, getInPC, layoutGetMaxValue }, |
613 | { UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue }, | |
614 | { UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue }, | |
729e4ab9 A |
615 | }; |
616 | ||
617 | U_CAPI int32_t U_EXPORT2 | |
618 | u_getIntPropertyValue(UChar32 c, UProperty which) { | |
619 | if(which<UCHAR_INT_START) { | |
620 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { | |
621 | const BinaryProperty &prop=binProps[which]; | |
622 | return prop.contains(prop, c, which); | |
623 | } | |
624 | } else if(which<UCHAR_INT_LIMIT) { | |
625 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; | |
626 | return prop.getValue(prop, c, which); | |
627 | } else if(which==UCHAR_GENERAL_CATEGORY_MASK) { | |
628 | return U_MASK(u_charType(c)); | |
629 | } | |
630 | return 0; // undefined | |
631 | } | |
632 | ||
633 | U_CAPI int32_t U_EXPORT2 | |
634 | u_getIntPropertyMinValue(UProperty /*which*/) { | |
635 | return 0; /* all binary/enum/int properties have a minimum value of 0 */ | |
636 | } | |
637 | ||
638 | U_CAPI int32_t U_EXPORT2 | |
639 | u_getIntPropertyMaxValue(UProperty which) { | |
640 | if(which<UCHAR_INT_START) { | |
641 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { | |
642 | return 1; // maximum TRUE for all binary properties | |
643 | } | |
644 | } else if(which<UCHAR_INT_LIMIT) { | |
645 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; | |
646 | return prop.getMaxValue(prop, which); | |
647 | } | |
648 | return -1; // undefined | |
649 | } | |
650 | ||
651 | U_CFUNC UPropertySource U_EXPORT2 | |
652 | uprops_getSource(UProperty which) { | |
653 | if(which<UCHAR_BINARY_START) { | |
654 | return UPROPS_SRC_NONE; /* undefined */ | |
655 | } else if(which<UCHAR_BINARY_LIMIT) { | |
656 | const BinaryProperty &prop=binProps[which]; | |
657 | if(prop.mask!=0) { | |
658 | return UPROPS_SRC_PROPSVEC; | |
659 | } else { | |
660 | return (UPropertySource)prop.column; | |
661 | } | |
662 | } else if(which<UCHAR_INT_START) { | |
663 | return UPROPS_SRC_NONE; /* undefined */ | |
664 | } else if(which<UCHAR_INT_LIMIT) { | |
665 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; | |
666 | if(prop.mask!=0) { | |
667 | return UPROPS_SRC_PROPSVEC; | |
668 | } else { | |
669 | return (UPropertySource)prop.column; | |
670 | } | |
671 | } else if(which<UCHAR_STRING_START) { | |
672 | switch(which) { | |
673 | case UCHAR_GENERAL_CATEGORY_MASK: | |
674 | case UCHAR_NUMERIC_VALUE: | |
675 | return UPROPS_SRC_CHAR; | |
676 | ||
677 | default: | |
678 | return UPROPS_SRC_NONE; | |
679 | } | |
680 | } else if(which<UCHAR_STRING_LIMIT) { | |
681 | switch(which) { | |
682 | case UCHAR_AGE: | |
683 | return UPROPS_SRC_PROPSVEC; | |
684 | ||
685 | case UCHAR_BIDI_MIRRORING_GLYPH: | |
686 | return UPROPS_SRC_BIDI; | |
687 | ||
688 | case UCHAR_CASE_FOLDING: | |
689 | case UCHAR_LOWERCASE_MAPPING: | |
690 | case UCHAR_SIMPLE_CASE_FOLDING: | |
691 | case UCHAR_SIMPLE_LOWERCASE_MAPPING: | |
692 | case UCHAR_SIMPLE_TITLECASE_MAPPING: | |
693 | case UCHAR_SIMPLE_UPPERCASE_MAPPING: | |
694 | case UCHAR_TITLECASE_MAPPING: | |
695 | case UCHAR_UPPERCASE_MAPPING: | |
696 | return UPROPS_SRC_CASE; | |
697 | ||
698 | case UCHAR_ISO_COMMENT: | |
699 | case UCHAR_NAME: | |
700 | case UCHAR_UNICODE_1_NAME: | |
701 | return UPROPS_SRC_NAMES; | |
702 | ||
703 | default: | |
704 | return UPROPS_SRC_NONE; | |
705 | } | |
706 | } else { | |
707 | switch(which) { | |
708 | case UCHAR_SCRIPT_EXTENSIONS: | |
709 | return UPROPS_SRC_PROPSVEC; | |
710 | default: | |
711 | return UPROPS_SRC_NONE; /* undefined */ | |
712 | } | |
713 | } | |
714 | } | |
715 | ||
3d1f044b A |
716 | U_CFUNC void U_EXPORT2 |
717 | uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode) { | |
718 | if (!ulayout_ensureData(*pErrorCode)) { return; } | |
719 | const UCPTrie *trie; | |
720 | switch (src) { | |
721 | case UPROPS_SRC_INPC: | |
722 | trie = gInpcTrie; | |
723 | break; | |
724 | case UPROPS_SRC_INSC: | |
725 | trie = gInscTrie; | |
726 | break; | |
727 | case UPROPS_SRC_VO: | |
728 | trie = gVoTrie; | |
729 | break; | |
730 | default: | |
731 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
732 | return; | |
733 | } | |
734 | ||
735 | if (trie == nullptr) { | |
736 | *pErrorCode = U_MISSING_RESOURCE_ERROR; | |
737 | return; | |
738 | } | |
739 | ||
740 | // Add the start code point of each same-value range of the trie. | |
741 | UChar32 start = 0, end; | |
742 | while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, | |
743 | nullptr, nullptr, nullptr)) >= 0) { | |
744 | sa->add(sa->set, start); | |
745 | start = end + 1; | |
746 | } | |
747 | } | |
748 | ||
729e4ab9 A |
749 | #if !UCONFIG_NO_NORMALIZATION |
750 | ||
751 | U_CAPI int32_t U_EXPORT2 | |
752 | u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) { | |
753 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
754 | return 0; | |
755 | } | |
756 | if(destCapacity<0 || (dest==NULL && destCapacity>0)) { | |
757 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
758 | return 0; | |
759 | } | |
760 | // Compute the FC_NFKC_Closure on the fly: | |
761 | // We have the API for complete coverage of Unicode properties, although | |
762 | // this value by itself is not useful via API. | |
763 | // (What could be useful is a custom normalization table that combines | |
764 | // case folding and NFKC.) | |
765 | // For the derivation, see Unicode's DerivedNormalizationProps.txt. | |
b331163b | 766 | const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode); |
729e4ab9 A |
767 | if(U_FAILURE(*pErrorCode)) { |
768 | return 0; | |
769 | } | |
770 | // first: b = NFKC(Fold(a)) | |
771 | UnicodeString folded1String; | |
772 | const UChar *folded1; | |
f3c0d7a5 | 773 | int32_t folded1Length=ucase_toFullFolding(c, &folded1, U_FOLD_CASE_DEFAULT); |
729e4ab9 A |
774 | if(folded1Length<0) { |
775 | const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc); | |
776 | if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) { | |
777 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); // c does not change at all under CaseFolding+NFKC | |
778 | } | |
779 | folded1String.setTo(c); | |
780 | } else { | |
781 | if(folded1Length>UCASE_MAX_STRING_LENGTH) { | |
782 | folded1String.setTo(folded1Length); | |
783 | } else { | |
784 | folded1String.setTo(FALSE, folded1, folded1Length); | |
785 | } | |
786 | } | |
787 | UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode); | |
788 | // second: c = NFKC(Fold(b)) | |
789 | UnicodeString folded2String(kc1); | |
790 | UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode); | |
791 | // if (c != b) add the mapping from a to c | |
792 | if(U_FAILURE(*pErrorCode) || kc1==kc2) { | |
793 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); | |
794 | } else { | |
795 | return kc2.extract(dest, destCapacity, *pErrorCode); | |
796 | } | |
797 | } | |
798 | ||
799 | #endif |