1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
18 #include "normalizer2impl.h"
20 #include "ubidi_props.h"
26 using icu::LocalPointer
;
27 #if !UCONFIG_NO_NORMALIZATION
28 using icu::Normalizer2Factory
;
29 using icu::Normalizer2Impl
;
32 using icu::UnicodeSet
;
// Forward declaration of the cleanup hook; it is defined further down in this
// file and passed to ucln_common_registerCleanup() by the init functions.
36 UBool U_CALLCONV
characterproperties_cleanup();
// Number of slots in gInclusions: indexes below UPROPS_SRC_COUNT are used per
// property source (gInclusions[src]); the remaining slots are used per int
// property at index UPROPS_SRC_COUNT + prop - UCHAR_INT_START.
38 constexpr int32_t NUM_INCLUSIONS
= UPROPS_SRC_COUNT
+ UCHAR_INT_LIMIT
- UCHAR_INT_START
;
// Per-slot state, accessed below as gInclusions[i].fSet and
// gInclusions[i].fInitOnce.
// NOTE(review): the declaration of the enclosing Inclusion type is not visible
// in this listing; these two lines appear to be its members -- confirm.
// fSet is nullptr until an init function stores the computed set.
41 UnicodeSet
*fSet
= nullptr;
// fInitOnce is handed to umtx_initOnce() to guard initialization of the slot.
42 UInitOnce fInitOnce
= U_INITONCE_INITIALIZER
;
44 Inclusion gInclusions
[NUM_INCLUSIONS
]; // cached getInclusions()
// Cache used by u_getBinaryPropertySet(): indexed by the property value,
// entries start as nullptr and are filled with the result of makeSet().
46 UnicodeSet
*sets
[UCHAR_BINARY_LIMIT
] = {};
// Cache used by u_getIntPropertyMap(): indexed by (property - UCHAR_INT_START),
// entries start as nullptr and are filled with the result of makeMap().
48 UCPMap
*maps
[UCHAR_INT_LIMIT
- UCHAR_INT_START
] = {};
52 //----------------------------------------------------------------
54 //----------------------------------------------------------------
56 // USetAdder implementation
57 // Does not use uset.h to reduce code dependencies
// Adds the single code point c to a set.
// The opaque USet pointer is cast to UnicodeSet so that UnicodeSet::add() can
// be called directly instead of going through the uset.h C API.
// NOTE(review): the return type/linkage line and the closing brace are not
// visible in this listing.
59 _set_add(USet
*set
, UChar32 c
) {
60 ((UnicodeSet
*)set
)->add(c
);
// Adds the code point range start..end to a set.
// The opaque USet pointer is cast to UnicodeSet and the two-argument
// UnicodeSet::add(start, end) overload is called on it.
// NOTE(review): the return type/linkage line and the closing brace are not
// visible in this listing.
64 _set_addRange(USet
*set
, UChar32 start
, UChar32 end
) {
65 ((UnicodeSet
*)set
)->add(start
, end
);
// Adds the string str (of the given length) to a set.
// The string is wrapped in a temporary icu::UnicodeString whose first
// constructor argument is true exactly when length is negative.
// NOTE(review): in the ICU UnicodeString API this three-argument form is the
// read-only aliasing constructor and the flag means "NUL-terminated" -- confirm
// against the UnicodeString documentation.
// NOTE(review): the return type/linkage line and the closing brace are not
// visible in this listing.
69 _set_addString(USet
*set
, const UChar
*str
, int32_t length
) {
70 ((UnicodeSet
*)set
)->add(icu::UnicodeString((UBool
)(length
<0), str
, length
));
// Cleanup hook for this file's caches; it is the function registered with
// ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, ...).
// It walks the three file-level caches: gInclusions, sets and maps.
// NOTE(review): the bodies of the first two loops and the return statement are
// not visible in this listing; only the release of the maps entries is shown.
73 UBool U_CALLCONV
characterproperties_cleanup() {
// Visit every cached inclusion slot (loop body not visible here).
74 for (Inclusion
&in
: gInclusions
) {
// Visit every cached binary-property set (loop body not visible here).
79 for (int32_t i
= 0; i
< UPRV_LENGTHOF(sets
); ++i
) {
// Close every cached int-property map. The entries are stored as UCPMap* and
// are cast back to UCPTrie* for ucptrie_close().
83 for (int32_t i
= 0; i
< UPRV_LENGTHOF(maps
); ++i
) {
84 ucptrie_close(reinterpret_cast<UCPTrie
*>(maps
[i
]));
// Builds the inclusion set for one property source and caches it in
// gInclusions[src].fSet.
//
// src:       property source; asserted to be in [0, UPROPS_SRC_COUNT).
// errorCode: in/out ICU error code. Set to U_INTERNAL_PROGRAM_ERROR for
//            UPROPS_SRC_NONE and in the fallback branch of the dispatch, and to
//            U_MEMORY_ALLOCATION_ERROR when the set cannot be allocated or
//            turns out bogus.
//
// On success the set is transferred out of the LocalPointer into the cache and
// characterproperties_cleanup is registered.
// NOTE(review): this listing omits several original lines (the switch header,
// break/return statements, closing braces, the USetAdder callback entries and
// the compact() call) -- consult the full file before editing.
90 void U_CALLCONV
initInclusion(UPropertySource src
, UErrorCode
&errorCode
) {
91 // This function is invoked only via umtx_initOnce().
92 U_ASSERT(0 <= src
&& src
< UPROPS_SRC_COUNT
);
// Asking for UPROPS_SRC_NONE is reported as an internal program error.
93 if (src
== UPROPS_SRC_NONE
) {
94 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
// The slot must not have been filled yet.
97 U_ASSERT(gInclusions
[src
].fSet
== nullptr);
// Start from an empty set owned by a LocalPointer until it is cached.
99 LocalPointer
<UnicodeSet
> incl(new UnicodeSet());
// Allocation-failure path (the guarding condition is not visible here).
101 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
// Fields of the adder structure `sa` that is passed to the
// *_addPropertyStarts() functions below: the target set, then callbacks.
// Only the set pointer and the two unused removal callbacks are visible here.
105 (USet
*)incl
.getAlias(),
109 nullptr, // don't need remove()
110 nullptr // don't need removeRange()
// Dispatch on the source: each case asks the matching data provider to add its
// property starts to the set through `sa`.
114 case UPROPS_SRC_CHAR
:
115 uchar_addPropertyStarts(&sa
, &errorCode
);
117 case UPROPS_SRC_PROPSVEC
:
118 upropsvec_addPropertyStarts(&sa
, &errorCode
);
// Combined source: both providers contribute to the same set.
120 case UPROPS_SRC_CHAR_AND_PROPSVEC
:
121 uchar_addPropertyStarts(&sa
, &errorCode
);
122 upropsvec_addPropertyStarts(&sa
, &errorCode
);
// The normalization-based sources are only compiled in when normalization is
// enabled. Each of them first obtains a Normalizer2Impl and uses it only if
// that succeeded.
124 #if !UCONFIG_NO_NORMALIZATION
125 case UPROPS_SRC_CASE_AND_NORM
: {
126 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFCImpl(errorCode
);
127 if(U_SUCCESS(errorCode
)) {
128 impl
->addPropertyStarts(&sa
, errorCode
);
// Case data is added in addition to the NFC data for this combined source.
130 ucase_addPropertyStarts(&sa
, &errorCode
);
133 case UPROPS_SRC_NFC
: {
134 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFCImpl(errorCode
);
135 if(U_SUCCESS(errorCode
)) {
136 impl
->addPropertyStarts(&sa
, errorCode
);
140 case UPROPS_SRC_NFKC
: {
141 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFKCImpl(errorCode
);
142 if(U_SUCCESS(errorCode
)) {
143 impl
->addPropertyStarts(&sa
, errorCode
);
147 case UPROPS_SRC_NFKC_CF
: {
148 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFKC_CFImpl(errorCode
);
149 if(U_SUCCESS(errorCode
)) {
150 impl
->addPropertyStarts(&sa
, errorCode
);
// Uses the NFC implementation but a different entry point
// (addCanonIterPropertyStarts) than UPROPS_SRC_NFC.
154 case UPROPS_SRC_NFC_CANON_ITER
: {
155 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFCImpl(errorCode
);
156 if(U_SUCCESS(errorCode
)) {
157 impl
->addCanonIterPropertyStarts(&sa
, errorCode
);
162 case UPROPS_SRC_CASE
:
163 ucase_addPropertyStarts(&sa
, &errorCode
);
165 case UPROPS_SRC_BIDI
:
166 ubidi_addPropertyStarts(&sa
, &errorCode
);
// These sources share one provider, which is told which source to serve.
// NOTE(review): original line 170 is not visible; there may be another case
// label between these two and the call.
168 case UPROPS_SRC_INPC
:
169 case UPROPS_SRC_INSC
:
171 uprops_addPropertyStarts((UPropertySource
)src
, &sa
, &errorCode
);
// Fallback for a source without a handler above (the label itself is not
// visible in this listing).
174 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
// Error exit after the dispatch (body not visible here).
178 if (U_FAILURE(errorCode
)) {
// A bogus set is reported as an out-of-memory condition.
181 if (incl
->isBogus()) {
182 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
185 // Compact for caching.
// Transfer ownership from the LocalPointer to the cache slot, then make sure
// the cleanup hook is registered.
187 gInclusions
[src
].fSet
= incl
.orphan();
188 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES
, characterproperties_cleanup
);
// Returns the cached inclusion set for a property source, building it on first
// use through umtx_initOnce() with initInclusion().
//
// src:       property source; values outside [0, UPROPS_SRC_COUNT) set
//            errorCode to U_ILLEGAL_ARGUMENT_ERROR.
// errorCode: in/out ICU error code; if it already indicates failure on entry
//            the function returns nullptr immediately.
// NOTE(review): the final return statement is not visible in this listing;
// presumably it returns the slot's fSet -- confirm.
191 const UnicodeSet
*getInclusionsForSource(UPropertySource src
, UErrorCode
&errorCode
) {
192 if (U_FAILURE(errorCode
)) { return nullptr; }
// Range check before src is used as an index into gInclusions.
193 if (src
< 0 || UPROPS_SRC_COUNT
<= src
) {
194 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
// The per-source slots occupy the first UPROPS_SRC_COUNT entries.
197 Inclusion
&i
= gInclusions
[src
];
// Initialization is routed through the slot's own UInitOnce.
198 umtx_initOnce(i
.fInitOnce
, &initInclusion
, src
, errorCode
);
// Builds the inclusion set for one int property and caches it in
// gInclusions[UPROPS_SRC_COUNT + prop - UCHAR_INT_START].fSet.
//
// The set is derived from the inclusion set of the property's source: every
// code point in every range of that set is looked up with
// u_getIntPropertyValue() and compared with the previously seen value.
//
// prop:      int property; asserted to be in [UCHAR_INT_START, UCHAR_INT_LIMIT).
// errorCode: in/out ICU error code; set to U_MEMORY_ALLOCATION_ERROR when the
//            new set cannot be allocated or turns out bogus.
// NOTE(review): this listing omits several original lines, including the body
// executed when the value changes (presumably it records c and updates
// prevValue -- confirm), the early returns and the closing braces.
202 void U_CALLCONV
initIntPropInclusion(UProperty prop
, UErrorCode
&errorCode
) {
203 // This function is invoked only via umtx_initOnce().
204 U_ASSERT(UCHAR_INT_START
<= prop
&& prop
< UCHAR_INT_LIMIT
);
// Int-property slots follow the UPROPS_SRC_COUNT per-source slots.
205 int32_t inclIndex
= UPROPS_SRC_COUNT
+ prop
- UCHAR_INT_START
;
// The slot must not have been filled yet.
206 U_ASSERT(gInclusions
[inclIndex
].fSet
== nullptr);
// Fetch (and, if needed, build) the inclusion set of the property's source.
207 UPropertySource src
= uprops_getSource(prop
);
208 const UnicodeSet
*incl
= getInclusionsForSource(src
, errorCode
);
209 if (U_FAILURE(errorCode
)) {
// The new set is constructed from the range 0..0 and owned by a LocalPointer
// until it is cached.
213 LocalPointer
<UnicodeSet
> intPropIncl(new UnicodeSet(0, 0));
214 if (intPropIncl
.isNull()) {
215 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
218 int32_t numRanges
= incl
->getRangeCount();
// Value to compare each looked-up property value against; starts at 0.
219 int32_t prevValue
= 0;
220 for (int32_t i
= 0; i
< numRanges
; ++i
) {
221 UChar32 rangeEnd
= incl
->getRangeEnd(i
);
222 for (UChar32 c
= incl
->getRangeStart(i
); c
<= rangeEnd
; ++c
) {
223 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
224 int32_t value
= u_getIntPropertyValue(c
, prop
);
// Reacts only when the value differs from prevValue (body not visible here).
225 if (value
!= prevValue
) {
// A bogus set is reported as an out-of-memory condition.
232 if (intPropIncl
->isBogus()) {
233 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
236 // Compact for caching.
237 intPropIncl
->compact();
// Transfer ownership to the cache slot, then make sure the cleanup hook is
// registered.
238 gInclusions
[inclIndex
].fSet
= intPropIncl
.orphan();
239 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES
, characterproperties_cleanup
);
// Returns the inclusion set to use for a property.
//
// For int properties (UCHAR_INT_START <= prop < UCHAR_INT_LIMIT) a
// per-property slot is initialized via umtx_initOnce() with
// initIntPropInclusion(); for other properties the inclusion set of the
// property's source is returned via getInclusionsForSource().
//
// prop:      the property to look up.
// errorCode: in/out ICU error code; if it already indicates failure on entry
//            the function returns nullptr immediately.
// NOTE(review): the return statement of the int-property branch and the
// branch separator are not visible in this listing; presumably the branch
// returns the slot's fSet -- confirm.
246 const UnicodeSet
*CharacterProperties::getInclusionsForProperty(
247 UProperty prop
, UErrorCode
&errorCode
) {
248 if (U_FAILURE(errorCode
)) { return nullptr; }
249 if (UCHAR_INT_START
<= prop
&& prop
< UCHAR_INT_LIMIT
) {
// Int-property slots follow the UPROPS_SRC_COUNT per-source slots.
250 int32_t inclIndex
= UPROPS_SRC_COUNT
+ prop
- UCHAR_INT_START
;
251 Inclusion
&i
= gInclusions
[inclIndex
];
252 umtx_initOnce(i
.fInitOnce
, &initIntPropInclusion
, prop
, errorCode
);
// Any other property: use the inclusion set of its source.
255 UPropertySource src
= uprops_getSource(prop
);
256 return getInclusionsForSource(src
, errorCode
);
// Builds a new UnicodeSet of the code points for which u_hasBinaryProperty()
// reports the given property.
//
// The scan walks every code point in every range of the property's inclusion
// set and tracks runs of consecutive "true" results: startHasProperty holds
// the first code point of the current run, or -1 when no run is open.
//
// property:  the binary property to evaluate.
// errorCode: in/out ICU error code; the function returns nullptr if it
//            indicates failure on entry or after fetching the inclusion set.
// NOTE(review): this listing omits the allocation check, the closing braces
// and the final return (presumably the set is handed to the caller -- confirm).
264 UnicodeSet
*makeSet(UProperty property
, UErrorCode
&errorCode
) {
265 if (U_FAILURE(errorCode
)) { return nullptr; }
266 LocalPointer
<UnicodeSet
> set(new UnicodeSet());
// Allocation-failure path (the guarding condition is not visible here).
268 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
271 const UnicodeSet
*inclusions
=
272 icu::CharacterProperties::getInclusionsForProperty(property
, errorCode
);
273 if (U_FAILURE(errorCode
)) { return nullptr; }
274 int32_t numRanges
= inclusions
->getRangeCount();
// -1 means "not currently inside a run of code points with the property".
275 UChar32 startHasProperty
= -1;
277 for (int32_t i
= 0; i
< numRanges
; ++i
) {
278 UChar32 rangeEnd
= inclusions
->getRangeEnd(i
);
279 for (UChar32 c
= inclusions
->getRangeStart(i
); c
<= rangeEnd
; ++c
) {
280 // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
281 if (u_hasBinaryProperty(c
, property
)) {
282 if (startHasProperty
< 0) {
283 // Transition from false to true.
284 startHasProperty
= c
;
286 } else if (startHasProperty
>= 0) {
287 // Transition from true to false.
// The run ends just before c, so add [startHasProperty, c - 1] and reset.
288 set
->add(startHasProperty
, c
- 1);
289 startHasProperty
= -1;
// A run still open after the scan is extended to the last code point.
293 if (startHasProperty
>= 0) {
294 set
->add(startHasProperty
, 0x10FFFF);
// Builds an immutable code point map holding the values of an int property.
//
// A mutable code point trie is opened with nullValue passed for both of its
// value arguments, filled with one setRange() call per run of equal non-null
// values found while scanning the property's inclusion set, and finally
// frozen with umutablecptrie_buildImmutable(); the result is returned as a
// UCPMap*.
//
// property:  the int property to evaluate.
// errorCode: in/out ICU error code; the function returns nullptr if it
//            indicates failure on entry or after fetching the inclusion set.
// NOTE(review): this listing omits several original lines, including the
// declarations of `start` and `type`, the updates of `start`/`value` inside
// the loop, and the conditions guarding the trailing setRange(), the SMALL
// type branch and the 8-bit/32-bit width branches -- consult the full file.
300 UCPMap
*makeMap(UProperty property
, UErrorCode
&errorCode
) {
301 if (U_FAILURE(errorCode
)) { return nullptr; }
// UCHAR_SCRIPT uses USCRIPT_UNKNOWN as its null value; every other property
// uses 0.
302 uint32_t nullValue
= property
== UCHAR_SCRIPT
? USCRIPT_UNKNOWN
: 0;
// LocalUMutableCPTriePointer owns the mutable trie for the rest of the function.
303 icu::LocalUMutableCPTriePointer
mutableTrie(
304 umutablecptrie_open(nullValue
, nullValue
, &errorCode
));
305 const UnicodeSet
*inclusions
=
306 icu::CharacterProperties::getInclusionsForProperty(property
, errorCode
);
307 if (U_FAILURE(errorCode
)) { return nullptr; }
308 int32_t numRanges
= inclusions
->getRangeCount();
// Value of the run currently being tracked; starts out as the null value.
310 uint32_t value
= nullValue
;
312 for (int32_t i
= 0; i
< numRanges
; ++i
) {
313 UChar32 rangeEnd
= inclusions
->getRangeEnd(i
);
314 for (UChar32 c
= inclusions
->getRangeStart(i
); c
<= rangeEnd
; ++c
) {
315 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
316 uint32_t nextValue
= u_getIntPropertyValue(c
, property
);
// On a value change, the finished run [start, c - 1] is written to the trie,
// but only if its value differs from nullValue.
317 if (value
!= nextValue
) {
318 if (value
!= nullValue
) {
319 umutablecptrie_setRange(mutableTrie
.getAlias(), start
, c
- 1, value
, &errorCode
);
// Writes the last run up to the final code point (guard not visible here).
327 umutablecptrie_setRange(mutableTrie
.getAlias(), start
, 0x10FFFF, value
, &errorCode
);
// Trie type: FAST for UCHAR_BIDI_CLASS and UCHAR_GENERAL_CATEGORY, SMALL
// otherwise.
331 if (property
== UCHAR_BIDI_CLASS
|| property
== UCHAR_GENERAL_CATEGORY
) {
332 type
= UCPTRIE_TYPE_FAST
;
334 type
= UCPTRIE_TYPE_SMALL
;
// Value width is chosen from the property's maximum value: 16 bits when
// max <= 0xffff; the conditions selecting 8 and 32 bits are not visible here.
336 UCPTrieValueWidth valueWidth
;
337 // TODO: UCharacterProperty.IntProperty
338 int32_t max
= u_getIntPropertyMaxValue(property
);
340 valueWidth
= UCPTRIE_VALUE_BITS_8
;
341 } else if (max
<= 0xffff) {
342 valueWidth
= UCPTRIE_VALUE_BITS_16
;
344 valueWidth
= UCPTRIE_VALUE_BITS_32
;
// The immutable trie produced by the builder is returned to the caller as a
// UCPMap*.
346 return reinterpret_cast<UCPMap
*>(
347 umutablecptrie_buildImmutable(mutableTrie
.getAlias(), type
, valueWidth
, &errorCode
));
// C API: returns the set of code points that have the given binary property.
//
// The result is cached in sets[property]; on the first request for a property
// it is built with makeSet().
//
// property:   binary property; values outside [0, UCHAR_BINARY_LIMIT) set
//             *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR.
// pErrorCode: in/out ICU error code; nullptr is returned if it indicates
//             failure on entry or after building the set.
// Returns the cached UnicodeSet viewed as a const USet*.
// NOTE(review): no locking is visible in this listing around the read and
// write of sets[property]; original line 361 is omitted here -- confirm that
// the cache access is synchronized in the full file.
354 U_CAPI
const USet
* U_EXPORT2
355 u_getBinaryPropertySet(UProperty property
, UErrorCode
*pErrorCode
) {
356 if (U_FAILURE(*pErrorCode
)) { return nullptr; }
// Range check before property is used as an index into sets.
357 if (property
< 0 || UCHAR_BINARY_LIMIT
<= property
) {
358 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
362 UnicodeSet
*set
= sets
[property
];
// First request for this property: build the set and store it in the cache.
363 if (set
== nullptr) {
364 sets
[property
] = set
= makeSet(property
, *pErrorCode
);
366 if (U_FAILURE(*pErrorCode
)) { return nullptr; }
367 return set
->toUSet();
370 U_CAPI
const UCPMap
* U_EXPORT2
371 u_getIntPropertyMap(UProperty property
, UErrorCode
*pErrorCode
) {
372 if (U_FAILURE(*pErrorCode
)) { return nullptr; }
373 if (property
< UCHAR_INT_START
|| UCHAR_INT_LIMIT
<= property
) {
374 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
378 UCPMap
*map
= maps
[property
- UCHAR_INT_START
];
379 if (map
== nullptr) {
380 maps
[property
- UCHAR_INT_START
] = map
= makeMap(property
, *pErrorCode
);