1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
18 #include "normalizer2impl.h"
20 #include "ubidi_props.h"
26 using icu::LocalPointer
;
27 #if !UCONFIG_NO_NORMALIZATION
28 using icu::Normalizer2Factory
;
29 using icu::Normalizer2Impl
;
32 using icu::UnicodeSet
;
36 UBool U_CALLCONV
characterproperties_cleanup();
38 constexpr int32_t NUM_INCLUSIONS
= UPROPS_SRC_COUNT
+ UCHAR_INT_LIMIT
- UCHAR_INT_START
;
41 UnicodeSet
*fSet
= nullptr;
42 UInitOnce fInitOnce
= U_INITONCE_INITIALIZER
;
44 Inclusion gInclusions
[NUM_INCLUSIONS
]; // cached getInclusions()
46 UnicodeSet
*sets
[UCHAR_BINARY_LIMIT
] = {};
48 UCPMap
*maps
[UCHAR_INT_LIMIT
- UCHAR_INT_START
] = {};
50 icu::UMutex
*cpMutex() {
51 static icu::UMutex
*m
= STATIC_NEW(icu::UMutex
);
55 //----------------------------------------------------------------
57 //----------------------------------------------------------------
59 // USetAdder implementation
60 // Does not use uset.h to reduce code dependencies
62 _set_add(USet
*set
, UChar32 c
) {
63 ((UnicodeSet
*)set
)->add(c
);
67 _set_addRange(USet
*set
, UChar32 start
, UChar32 end
) {
68 ((UnicodeSet
*)set
)->add(start
, end
);
72 _set_addString(USet
*set
, const UChar
*str
, int32_t length
) {
73 ((UnicodeSet
*)set
)->add(icu::UnicodeString((UBool
)(length
<0), str
, length
));
76 UBool U_CALLCONV
characterproperties_cleanup() {
77 for (Inclusion
&in
: gInclusions
) {
82 for (int32_t i
= 0; i
< UPRV_LENGTHOF(sets
); ++i
) {
86 for (int32_t i
= 0; i
< UPRV_LENGTHOF(maps
); ++i
) {
87 ucptrie_close(reinterpret_cast<UCPTrie
*>(maps
[i
]));
93 void U_CALLCONV
initInclusion(UPropertySource src
, UErrorCode
&errorCode
) {
94 // This function is invoked only via umtx_initOnce().
95 U_ASSERT(0 <= src
&& src
< UPROPS_SRC_COUNT
);
96 if (src
== UPROPS_SRC_NONE
) {
97 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
100 U_ASSERT(gInclusions
[src
].fSet
== nullptr);
102 LocalPointer
<UnicodeSet
> incl(new UnicodeSet());
104 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
108 (USet
*)incl
.getAlias(),
112 nullptr, // don't need remove()
113 nullptr // don't need removeRange()
117 case UPROPS_SRC_CHAR
:
118 uchar_addPropertyStarts(&sa
, &errorCode
);
120 case UPROPS_SRC_PROPSVEC
:
121 upropsvec_addPropertyStarts(&sa
, &errorCode
);
123 case UPROPS_SRC_CHAR_AND_PROPSVEC
:
124 uchar_addPropertyStarts(&sa
, &errorCode
);
125 upropsvec_addPropertyStarts(&sa
, &errorCode
);
127 #if !UCONFIG_NO_NORMALIZATION
128 case UPROPS_SRC_CASE_AND_NORM
: {
129 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFCImpl(errorCode
);
130 if(U_SUCCESS(errorCode
)) {
131 impl
->addPropertyStarts(&sa
, errorCode
);
133 ucase_addPropertyStarts(&sa
, &errorCode
);
136 case UPROPS_SRC_NFC
: {
137 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFCImpl(errorCode
);
138 if(U_SUCCESS(errorCode
)) {
139 impl
->addPropertyStarts(&sa
, errorCode
);
143 case UPROPS_SRC_NFKC
: {
144 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFKCImpl(errorCode
);
145 if(U_SUCCESS(errorCode
)) {
146 impl
->addPropertyStarts(&sa
, errorCode
);
150 case UPROPS_SRC_NFKC_CF
: {
151 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFKC_CFImpl(errorCode
);
152 if(U_SUCCESS(errorCode
)) {
153 impl
->addPropertyStarts(&sa
, errorCode
);
157 case UPROPS_SRC_NFC_CANON_ITER
: {
158 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFCImpl(errorCode
);
159 if(U_SUCCESS(errorCode
)) {
160 impl
->addCanonIterPropertyStarts(&sa
, errorCode
);
165 case UPROPS_SRC_CASE
:
166 ucase_addPropertyStarts(&sa
, &errorCode
);
168 case UPROPS_SRC_BIDI
:
169 ubidi_addPropertyStarts(&sa
, &errorCode
);
171 case UPROPS_SRC_INPC
:
172 case UPROPS_SRC_INSC
:
174 uprops_addPropertyStarts((UPropertySource
)src
, &sa
, &errorCode
);
177 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
181 if (U_FAILURE(errorCode
)) {
184 if (incl
->isBogus()) {
185 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
188 // Compact for caching.
190 gInclusions
[src
].fSet
= incl
.orphan();
191 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES
, characterproperties_cleanup
);
194 const UnicodeSet
*getInclusionsForSource(UPropertySource src
, UErrorCode
&errorCode
) {
195 if (U_FAILURE(errorCode
)) { return nullptr; }
196 if (src
< 0 || UPROPS_SRC_COUNT
<= src
) {
197 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
200 Inclusion
&i
= gInclusions
[src
];
201 umtx_initOnce(i
.fInitOnce
, &initInclusion
, src
, errorCode
);
205 void U_CALLCONV
initIntPropInclusion(UProperty prop
, UErrorCode
&errorCode
) {
206 // This function is invoked only via umtx_initOnce().
207 U_ASSERT(UCHAR_INT_START
<= prop
&& prop
< UCHAR_INT_LIMIT
);
208 int32_t inclIndex
= UPROPS_SRC_COUNT
+ prop
- UCHAR_INT_START
;
209 U_ASSERT(gInclusions
[inclIndex
].fSet
== nullptr);
210 UPropertySource src
= uprops_getSource(prop
);
211 const UnicodeSet
*incl
= getInclusionsForSource(src
, errorCode
);
212 if (U_FAILURE(errorCode
)) {
216 LocalPointer
<UnicodeSet
> intPropIncl(new UnicodeSet(0, 0));
217 if (intPropIncl
.isNull()) {
218 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
221 int32_t numRanges
= incl
->getRangeCount();
222 int32_t prevValue
= 0;
223 for (int32_t i
= 0; i
< numRanges
; ++i
) {
224 UChar32 rangeEnd
= incl
->getRangeEnd(i
);
225 for (UChar32 c
= incl
->getRangeStart(i
); c
<= rangeEnd
; ++c
) {
226 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
227 int32_t value
= u_getIntPropertyValue(c
, prop
);
228 if (value
!= prevValue
) {
235 if (intPropIncl
->isBogus()) {
236 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
239 // Compact for caching.
240 intPropIncl
->compact();
241 gInclusions
[inclIndex
].fSet
= intPropIncl
.orphan();
242 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES
, characterproperties_cleanup
);
249 const UnicodeSet
*CharacterProperties::getInclusionsForProperty(
250 UProperty prop
, UErrorCode
&errorCode
) {
251 if (U_FAILURE(errorCode
)) { return nullptr; }
252 if (UCHAR_INT_START
<= prop
&& prop
< UCHAR_INT_LIMIT
) {
253 int32_t inclIndex
= UPROPS_SRC_COUNT
+ prop
- UCHAR_INT_START
;
254 Inclusion
&i
= gInclusions
[inclIndex
];
255 umtx_initOnce(i
.fInitOnce
, &initIntPropInclusion
, prop
, errorCode
);
258 UPropertySource src
= uprops_getSource(prop
);
259 return getInclusionsForSource(src
, errorCode
);
267 UnicodeSet
*makeSet(UProperty property
, UErrorCode
&errorCode
) {
268 if (U_FAILURE(errorCode
)) { return nullptr; }
269 LocalPointer
<UnicodeSet
> set(new UnicodeSet());
271 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
274 const UnicodeSet
*inclusions
=
275 icu::CharacterProperties::getInclusionsForProperty(property
, errorCode
);
276 if (U_FAILURE(errorCode
)) { return nullptr; }
277 int32_t numRanges
= inclusions
->getRangeCount();
278 UChar32 startHasProperty
= -1;
280 for (int32_t i
= 0; i
< numRanges
; ++i
) {
281 UChar32 rangeEnd
= inclusions
->getRangeEnd(i
);
282 for (UChar32 c
= inclusions
->getRangeStart(i
); c
<= rangeEnd
; ++c
) {
283 // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
284 if (u_hasBinaryProperty(c
, property
)) {
285 if (startHasProperty
< 0) {
286 // Transition from false to true.
287 startHasProperty
= c
;
289 } else if (startHasProperty
>= 0) {
290 // Transition from true to false.
291 set
->add(startHasProperty
, c
- 1);
292 startHasProperty
= -1;
296 if (startHasProperty
>= 0) {
297 set
->add(startHasProperty
, 0x10FFFF);
303 UCPMap
*makeMap(UProperty property
, UErrorCode
&errorCode
) {
304 if (U_FAILURE(errorCode
)) { return nullptr; }
305 uint32_t nullValue
= property
== UCHAR_SCRIPT
? USCRIPT_UNKNOWN
: 0;
306 icu::LocalUMutableCPTriePointer
mutableTrie(
307 umutablecptrie_open(nullValue
, nullValue
, &errorCode
));
308 const UnicodeSet
*inclusions
=
309 icu::CharacterProperties::getInclusionsForProperty(property
, errorCode
);
310 if (U_FAILURE(errorCode
)) { return nullptr; }
311 int32_t numRanges
= inclusions
->getRangeCount();
313 uint32_t value
= nullValue
;
315 for (int32_t i
= 0; i
< numRanges
; ++i
) {
316 UChar32 rangeEnd
= inclusions
->getRangeEnd(i
);
317 for (UChar32 c
= inclusions
->getRangeStart(i
); c
<= rangeEnd
; ++c
) {
318 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
319 uint32_t nextValue
= u_getIntPropertyValue(c
, property
);
320 if (value
!= nextValue
) {
321 if (value
!= nullValue
) {
322 umutablecptrie_setRange(mutableTrie
.getAlias(), start
, c
- 1, value
, &errorCode
);
330 umutablecptrie_setRange(mutableTrie
.getAlias(), start
, 0x10FFFF, value
, &errorCode
);
334 if (property
== UCHAR_BIDI_CLASS
|| property
== UCHAR_GENERAL_CATEGORY
) {
335 type
= UCPTRIE_TYPE_FAST
;
337 type
= UCPTRIE_TYPE_SMALL
;
339 UCPTrieValueWidth valueWidth
;
340 // TODO: UCharacterProperty.IntProperty
341 int32_t max
= u_getIntPropertyMaxValue(property
);
343 valueWidth
= UCPTRIE_VALUE_BITS_8
;
344 } else if (max
<= 0xffff) {
345 valueWidth
= UCPTRIE_VALUE_BITS_16
;
347 valueWidth
= UCPTRIE_VALUE_BITS_32
;
349 return reinterpret_cast<UCPMap
*>(
350 umutablecptrie_buildImmutable(mutableTrie
.getAlias(), type
, valueWidth
, &errorCode
));
357 U_CAPI
const USet
* U_EXPORT2
358 u_getBinaryPropertySet(UProperty property
, UErrorCode
*pErrorCode
) {
359 if (U_FAILURE(*pErrorCode
)) { return nullptr; }
360 if (property
< 0 || UCHAR_BINARY_LIMIT
<= property
) {
361 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
365 UnicodeSet
*set
= sets
[property
];
366 if (set
== nullptr) {
367 sets
[property
] = set
= makeSet(property
, *pErrorCode
);
369 if (U_FAILURE(*pErrorCode
)) { return nullptr; }
370 return set
->toUSet();
373 U_CAPI
const UCPMap
* U_EXPORT2
374 u_getIntPropertyMap(UProperty property
, UErrorCode
*pErrorCode
) {
375 if (U_FAILURE(*pErrorCode
)) { return nullptr; }
376 if (property
< UCHAR_INT_START
|| UCHAR_INT_LIMIT
<= property
) {
377 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
381 UCPMap
*map
= maps
[property
- UCHAR_INT_START
];
382 if (map
== nullptr) {
383 maps
[property
- UCHAR_INT_START
] = map
= makeMap(property
, *pErrorCode
);