]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/characterproperties.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / common / characterproperties.cpp
1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
6
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
16 #include "cmemory.h"
17 #include "mutex.h"
18 #include "normalizer2impl.h"
19 #include "uassert.h"
20 #include "ubidi_props.h"
21 #include "ucase.h"
22 #include "ucln_cmn.h"
23 #include "umutex.h"
24 #include "uprops.h"
25
26 using icu::LocalPointer;
27 #if !UCONFIG_NO_NORMALIZATION
28 using icu::Normalizer2Factory;
29 using icu::Normalizer2Impl;
30 #endif
31 using icu::UInitOnce;
32 using icu::UnicodeSet;
33
34 namespace {
35
36 UBool U_CALLCONV characterproperties_cleanup();
37
38 constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
39
40 struct Inclusion {
41 UnicodeSet *fSet = nullptr;
42 UInitOnce fInitOnce = U_INITONCE_INITIALIZER;
43 };
44 Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
45
46 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
47
48 UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
49
50 icu::UMutex *cpMutex() {
51 static icu::UMutex *m = STATIC_NEW(icu::UMutex);
52 return m;
53 }
54
55 //----------------------------------------------------------------
56 // Inclusions list
57 //----------------------------------------------------------------
58
59 // USetAdder implementation
60 // Does not use uset.h to reduce code dependencies
61 void U_CALLCONV
62 _set_add(USet *set, UChar32 c) {
63 ((UnicodeSet *)set)->add(c);
64 }
65
66 void U_CALLCONV
67 _set_addRange(USet *set, UChar32 start, UChar32 end) {
68 ((UnicodeSet *)set)->add(start, end);
69 }
70
71 void U_CALLCONV
72 _set_addString(USet *set, const UChar *str, int32_t length) {
73 ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
74 }
75
76 UBool U_CALLCONV characterproperties_cleanup() {
77 for (Inclusion &in: gInclusions) {
78 delete in.fSet;
79 in.fSet = nullptr;
80 in.fInitOnce.reset();
81 }
82 for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
83 delete sets[i];
84 sets[i] = nullptr;
85 }
86 for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
87 ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
88 maps[i] = nullptr;
89 }
90 return TRUE;
91 }
92
93 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
94 // This function is invoked only via umtx_initOnce().
95 U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
96 if (src == UPROPS_SRC_NONE) {
97 errorCode = U_INTERNAL_PROGRAM_ERROR;
98 return;
99 }
100 U_ASSERT(gInclusions[src].fSet == nullptr);
101
102 LocalPointer<UnicodeSet> incl(new UnicodeSet());
103 if (incl.isNull()) {
104 errorCode = U_MEMORY_ALLOCATION_ERROR;
105 return;
106 }
107 USetAdder sa = {
108 (USet *)incl.getAlias(),
109 _set_add,
110 _set_addRange,
111 _set_addString,
112 nullptr, // don't need remove()
113 nullptr // don't need removeRange()
114 };
115
116 switch(src) {
117 case UPROPS_SRC_CHAR:
118 uchar_addPropertyStarts(&sa, &errorCode);
119 break;
120 case UPROPS_SRC_PROPSVEC:
121 upropsvec_addPropertyStarts(&sa, &errorCode);
122 break;
123 case UPROPS_SRC_CHAR_AND_PROPSVEC:
124 uchar_addPropertyStarts(&sa, &errorCode);
125 upropsvec_addPropertyStarts(&sa, &errorCode);
126 break;
127 #if !UCONFIG_NO_NORMALIZATION
128 case UPROPS_SRC_CASE_AND_NORM: {
129 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
130 if(U_SUCCESS(errorCode)) {
131 impl->addPropertyStarts(&sa, errorCode);
132 }
133 ucase_addPropertyStarts(&sa, &errorCode);
134 break;
135 }
136 case UPROPS_SRC_NFC: {
137 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
138 if(U_SUCCESS(errorCode)) {
139 impl->addPropertyStarts(&sa, errorCode);
140 }
141 break;
142 }
143 case UPROPS_SRC_NFKC: {
144 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
145 if(U_SUCCESS(errorCode)) {
146 impl->addPropertyStarts(&sa, errorCode);
147 }
148 break;
149 }
150 case UPROPS_SRC_NFKC_CF: {
151 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
152 if(U_SUCCESS(errorCode)) {
153 impl->addPropertyStarts(&sa, errorCode);
154 }
155 break;
156 }
157 case UPROPS_SRC_NFC_CANON_ITER: {
158 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
159 if(U_SUCCESS(errorCode)) {
160 impl->addCanonIterPropertyStarts(&sa, errorCode);
161 }
162 break;
163 }
164 #endif
165 case UPROPS_SRC_CASE:
166 ucase_addPropertyStarts(&sa, &errorCode);
167 break;
168 case UPROPS_SRC_BIDI:
169 ubidi_addPropertyStarts(&sa, &errorCode);
170 break;
171 case UPROPS_SRC_INPC:
172 case UPROPS_SRC_INSC:
173 case UPROPS_SRC_VO:
174 uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
175 break;
176 default:
177 errorCode = U_INTERNAL_PROGRAM_ERROR;
178 break;
179 }
180
181 if (U_FAILURE(errorCode)) {
182 return;
183 }
184 if (incl->isBogus()) {
185 errorCode = U_MEMORY_ALLOCATION_ERROR;
186 return;
187 }
188 // Compact for caching.
189 incl->compact();
190 gInclusions[src].fSet = incl.orphan();
191 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
192 }
193
194 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
195 if (U_FAILURE(errorCode)) { return nullptr; }
196 if (src < 0 || UPROPS_SRC_COUNT <= src) {
197 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
198 return nullptr;
199 }
200 Inclusion &i = gInclusions[src];
201 umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
202 return i.fSet;
203 }
204
205 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
206 // This function is invoked only via umtx_initOnce().
207 U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
208 int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
209 U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
210 UPropertySource src = uprops_getSource(prop);
211 const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
212 if (U_FAILURE(errorCode)) {
213 return;
214 }
215
216 LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
217 if (intPropIncl.isNull()) {
218 errorCode = U_MEMORY_ALLOCATION_ERROR;
219 return;
220 }
221 int32_t numRanges = incl->getRangeCount();
222 int32_t prevValue = 0;
223 for (int32_t i = 0; i < numRanges; ++i) {
224 UChar32 rangeEnd = incl->getRangeEnd(i);
225 for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
226 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
227 int32_t value = u_getIntPropertyValue(c, prop);
228 if (value != prevValue) {
229 intPropIncl->add(c);
230 prevValue = value;
231 }
232 }
233 }
234
235 if (intPropIncl->isBogus()) {
236 errorCode = U_MEMORY_ALLOCATION_ERROR;
237 return;
238 }
239 // Compact for caching.
240 intPropIncl->compact();
241 gInclusions[inclIndex].fSet = intPropIncl.orphan();
242 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
243 }
244
245 } // namespace
246
247 U_NAMESPACE_BEGIN
248
249 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
250 UProperty prop, UErrorCode &errorCode) {
251 if (U_FAILURE(errorCode)) { return nullptr; }
252 if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
253 int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
254 Inclusion &i = gInclusions[inclIndex];
255 umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
256 return i.fSet;
257 } else {
258 UPropertySource src = uprops_getSource(prop);
259 return getInclusionsForSource(src, errorCode);
260 }
261 }
262
263 U_NAMESPACE_END
264
265 namespace {
266
267 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
268 if (U_FAILURE(errorCode)) { return nullptr; }
269 LocalPointer<UnicodeSet> set(new UnicodeSet());
270 if (set.isNull()) {
271 errorCode = U_MEMORY_ALLOCATION_ERROR;
272 return nullptr;
273 }
274 const UnicodeSet *inclusions =
275 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
276 if (U_FAILURE(errorCode)) { return nullptr; }
277 int32_t numRanges = inclusions->getRangeCount();
278 UChar32 startHasProperty = -1;
279
280 for (int32_t i = 0; i < numRanges; ++i) {
281 UChar32 rangeEnd = inclusions->getRangeEnd(i);
282 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
283 // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
284 if (u_hasBinaryProperty(c, property)) {
285 if (startHasProperty < 0) {
286 // Transition from false to true.
287 startHasProperty = c;
288 }
289 } else if (startHasProperty >= 0) {
290 // Transition from true to false.
291 set->add(startHasProperty, c - 1);
292 startHasProperty = -1;
293 }
294 }
295 }
296 if (startHasProperty >= 0) {
297 set->add(startHasProperty, 0x10FFFF);
298 }
299 set->freeze();
300 return set.orphan();
301 }
302
303 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
304 if (U_FAILURE(errorCode)) { return nullptr; }
305 uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
306 icu::LocalUMutableCPTriePointer mutableTrie(
307 umutablecptrie_open(nullValue, nullValue, &errorCode));
308 const UnicodeSet *inclusions =
309 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
310 if (U_FAILURE(errorCode)) { return nullptr; }
311 int32_t numRanges = inclusions->getRangeCount();
312 UChar32 start = 0;
313 uint32_t value = nullValue;
314
315 for (int32_t i = 0; i < numRanges; ++i) {
316 UChar32 rangeEnd = inclusions->getRangeEnd(i);
317 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
318 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
319 uint32_t nextValue = u_getIntPropertyValue(c, property);
320 if (value != nextValue) {
321 if (value != nullValue) {
322 umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
323 }
324 start = c;
325 value = nextValue;
326 }
327 }
328 }
329 if (value != 0) {
330 umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
331 }
332
333 UCPTrieType type;
334 if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
335 type = UCPTRIE_TYPE_FAST;
336 } else {
337 type = UCPTRIE_TYPE_SMALL;
338 }
339 UCPTrieValueWidth valueWidth;
340 // TODO: UCharacterProperty.IntProperty
341 int32_t max = u_getIntPropertyMaxValue(property);
342 if (max <= 0xff) {
343 valueWidth = UCPTRIE_VALUE_BITS_8;
344 } else if (max <= 0xffff) {
345 valueWidth = UCPTRIE_VALUE_BITS_16;
346 } else {
347 valueWidth = UCPTRIE_VALUE_BITS_32;
348 }
349 return reinterpret_cast<UCPMap *>(
350 umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
351 }
352
353 } // namespace
354
355 U_NAMESPACE_USE
356
357 U_CAPI const USet * U_EXPORT2
358 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
359 if (U_FAILURE(*pErrorCode)) { return nullptr; }
360 if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
361 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
362 return nullptr;
363 }
364 Mutex m(cpMutex());
365 UnicodeSet *set = sets[property];
366 if (set == nullptr) {
367 sets[property] = set = makeSet(property, *pErrorCode);
368 }
369 if (U_FAILURE(*pErrorCode)) { return nullptr; }
370 return set->toUSet();
371 }
372
373 U_CAPI const UCPMap * U_EXPORT2
374 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
375 if (U_FAILURE(*pErrorCode)) { return nullptr; }
376 if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
377 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
378 return nullptr;
379 }
380 Mutex m(cpMutex());
381 UCPMap *map = maps[property - UCHAR_INT_START];
382 if (map == nullptr) {
383 maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
384 }
385 return map;
386 }