1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 #include "unicode/utypes.h"
6 #if !UCONFIG_NO_FORMATTING
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
12 #include "static_unicode_sets.h"
15 #include "unicode/uniset.h"
21 using namespace icu::unisets
;
26 UnicodeSet
* gUnicodeSets
[UNISETS_KEY_COUNT
] = {};
28 // Save the empty instance in static memory to have well-defined behavior if a
29 // regular UnicodeSet cannot be allocated.
31 char gEmptyUnicodeSet
[sizeof(UnicodeSet
)];
33 // Whether the gEmptyUnicodeSet is initialized and ready to use.
34 UBool gEmptyUnicodeSetInitialized
= FALSE
;
36 inline UnicodeSet
* getImpl(Key key
) {
37 UnicodeSet
* candidate
= gUnicodeSets
[key
];
38 if (candidate
== nullptr) {
39 return reinterpret_cast<UnicodeSet
*>(gEmptyUnicodeSet
);
44 UnicodeSet
* computeUnion(Key k1
, Key k2
) {
45 UnicodeSet
* result
= new UnicodeSet();
46 if (result
== nullptr) {
49 result
->addAll(*getImpl(k1
));
50 result
->addAll(*getImpl(k2
));
55 UnicodeSet
* computeUnion(Key k1
, Key k2
, Key k3
) {
56 UnicodeSet
* result
= new UnicodeSet();
57 if (result
== nullptr) {
60 result
->addAll(*getImpl(k1
));
61 result
->addAll(*getImpl(k2
));
62 result
->addAll(*getImpl(k3
));
68 void saveSet(Key key
, const UnicodeString
& unicodeSetPattern
, UErrorCode
& status
) {
69 // assert unicodeSets.get(key) == null;
70 gUnicodeSets
[key
] = new UnicodeSet(unicodeSetPattern
, status
);
// Resource sink whose put() walks a nested resource value (table of contexts ->
// table of strictness levels -> array of strings) and registers each recognized
// string as a UnicodeSet pattern through saveSet().
73 class ParseDataSink
: public ResourceSink
{
// put(): `value` is read as a table; `key` and `value` are then reused as the
// cursors for the nested getKeyAndValue()/getValue() loops below. Any failure
// reported through `status` makes the method return immediately.
75 void put(const char* key
, ResourceValue
& value
, UBool
/*noFallback*/, UErrorCode
& status
) U_OVERRIDE
{
76 ResourceTable contextsTable
= value
.getTable(status
);
77 if (U_FAILURE(status
)) { return; }
// Outer loop: one iteration per entry of the contexts table.
78 for (int i
= 0; contextsTable
.getKeyAndValue(i
, key
, value
); i
++) {
// NOTE(review): the statements belonging to the "date" branch are not visible
// in this excerpt; presumably "date" entries are skipped - confirm.
79 if (uprv_strcmp(key
, "date") == 0) {
82 ResourceTable strictnessTable
= value
.getTable(status
);
83 if (U_FAILURE(status
)) { return; }
// Middle loop: one iteration per strictness entry of the current context.
84 for (int j
= 0; strictnessTable
.getKeyAndValue(j
, key
, value
); j
++) {
// Only the exact key "lenient" counts as lenient; any other key is treated as
// strict for the comma/period choice below.
85 bool isLenient
= (uprv_strcmp(key
, "lenient") == 0);
86 ResourceArray array
= value
.getArray(status
);
87 if (U_FAILURE(status
)) { return; }
// Inner loop: each array element is a string that is used as a set pattern.
88 for (int k
= 0; k
< array
.getSize(); k
++) {
89 array
.getValue(k
, value
);
90 UnicodeString str
= value
.getUnicodeString(status
);
91 if (U_FAILURE(status
)) { return; }
// The string is classified by the first representative character found in it,
// tested in the order of the chain below; the whole string is then saved as
// the pattern for the matching key.
92 // There is both lenient and strict data for comma/period,
93 // but not for any of the other symbols.
94 if (str
.indexOf(u
'.') != -1) {
95 saveSet(isLenient
? PERIOD
: STRICT_PERIOD
, str
, status
);
96 } else if (str
.indexOf(u
',') != -1) {
97 saveSet(isLenient
? COMMA
: STRICT_COMMA
, str
, status
);
98 } else if (str
.indexOf(u
'+') != -1) {
99 saveSet(PLUS_SIGN
, str
, status
);
100 } else if (str
.indexOf(u
'-') != -1) {
101 saveSet(MINUS_SIGN
, str
, status
);
102 } else if (str
.indexOf(u
'$') != -1) {
103 saveSet(DOLLAR_SIGN
, str
, status
);
104 } else if (str
.indexOf(u
'£') != -1) {
105 saveSet(POUND_SIGN
, str
, status
);
106 } else if (str
.indexOf(u
'₹') != -1) {
107 saveSet(RUPEE_SIGN
, str
, status
);
108 } else if (str
.indexOf(u
'¥') != -1) {
109 saveSet(YEN_SIGN
, str
, status
);
110 } else if (str
.indexOf(u
'₩') != -1) {
111 saveSet(WON_SIGN
, str
, status
);
112 } else if (str
.indexOf(u
'%') != -1) {
113 saveSet(PERCENT_SIGN
, str
, status
);
114 } else if (str
.indexOf(u
'‰') != -1) {
115 saveSet(PERMILLE_SIGN
, str
, status
);
116 } else if (str
.indexOf(u
'’') != -1) {
117 saveSet(APOSTROPHE_SIGN
, str
, status
);
119 // Unknown class of parse lenients
120 // TODO(ICU-20428): Make ICU automatically accept new classes?
// Stop as soon as saving a set has reported an error.
123 if (U_FAILURE(status
)) { return; }
132 icu::UInitOnce gNumberParseUniSetsInitOnce
= U_INITONCE_INITIALIZER
;
134 UBool U_CALLCONV
cleanupNumberParseUniSets() {
135 if (gEmptyUnicodeSetInitialized
) {
136 reinterpret_cast<UnicodeSet
*>(gEmptyUnicodeSet
)->~UnicodeSet();
137 gEmptyUnicodeSetInitialized
= FALSE
;
139 for (int32_t i
= 0; i
< UNISETS_KEY_COUNT
; i
++) {
140 delete gUnicodeSets
[i
];
141 gUnicodeSets
[i
] = nullptr;
143 gNumberParseUniSetsInitOnce
.reset();
// Initializer that fills gUnicodeSets; unisets::get() runs it through
// umtx_initOnce(). Errors are reported through `status`, and each failure check
// below returns immediately, leaving the sets created so far in place.
147 void U_CALLCONV
initNumberParseUniSets(UErrorCode
& status
) {
// The cleanup callback is registered before anything is allocated.
148 ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS
, cleanupNumberParseUniSets
);
150 // Initialize the empty instance for well-defined fallback behavior
// The set is constructed in place inside the static gEmptyUnicodeSet buffer.
151 new(gEmptyUnicodeSet
) UnicodeSet();
152 reinterpret_cast<UnicodeSet
*>(gEmptyUnicodeSet
)->freeze();
153 gEmptyUnicodeSetInitialized
= TRUE
;
155 // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
156 // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
157 gUnicodeSets
[DEFAULT_IGNORABLES
] = new UnicodeSet(
158 u
"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status
);
159 gUnicodeSets
[STRICT_IGNORABLES
] = new UnicodeSet(u
"[[:Bidi_Control:]]", status
);
// Open the "root" bundle and pass its "parse" items to the sink.
161 LocalUResourceBundlePointer
rb(ures_open(nullptr, "root", &status
));
162 if (U_FAILURE(status
)) { return; }
// NOTE(review): `sink` is declared on a line not visible in this excerpt;
// presumably it is a ParseDataSink - confirm.
164 ures_getAllItemsWithFallback(rb
.getAlias(), "parse", sink
, status
);
165 if (U_FAILURE(status
)) { return; }
167 // NOTE: It is OK for these assertions to fail if there was a no-data build.
168 U_ASSERT(gUnicodeSets
[COMMA
] != nullptr);
169 U_ASSERT(gUnicodeSets
[STRICT_COMMA
] != nullptr);
170 U_ASSERT(gUnicodeSets
[PERIOD
] != nullptr);
171 U_ASSERT(gUnicodeSets
[STRICT_PERIOD
] != nullptr);
172 U_ASSERT(gUnicodeSets
[APOSTROPHE_SIGN
] != nullptr);
// NOTE(review): the remaining arguments of this constructor call are on lines
// not visible in this excerpt.
174 LocalPointer
<UnicodeSet
> otherGrouping(new UnicodeSet(
175 u
"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
178 if (U_FAILURE(status
)) { return; }
// NOTE(review): the APOSTROPHE_SIGN slot is dereferenced directly here, not
// through getImpl(), so a null slot would be a null dereference at this point.
179 otherGrouping
->addAll(*gUnicodeSets
[APOSTROPHE_SIGN
]);
// Ownership of the grouping set moves from the LocalPointer into the table.
180 gUnicodeSets
[OTHER_GROUPING_SEPARATORS
] = otherGrouping
.orphan();
// The unions are built from slots assigned above, so they must come after them.
181 gUnicodeSets
[ALL_SEPARATORS
] = computeUnion(COMMA
, PERIOD
, OTHER_GROUPING_SEPARATORS
);
182 gUnicodeSets
[STRICT_ALL_SEPARATORS
] = computeUnion(
183 STRICT_COMMA
, STRICT_PERIOD
, OTHER_GROUPING_SEPARATORS
);
185 U_ASSERT(gUnicodeSets
[MINUS_SIGN
] != nullptr);
186 U_ASSERT(gUnicodeSets
[PLUS_SIGN
] != nullptr);
187 U_ASSERT(gUnicodeSets
[PERCENT_SIGN
] != nullptr);
188 U_ASSERT(gUnicodeSets
[PERMILLE_SIGN
] != nullptr);
190 gUnicodeSets
[INFINITY_SIGN
] = new UnicodeSet(u
"[∞]", status
);
191 if (U_FAILURE(status
)) { return; }
193 U_ASSERT(gUnicodeSets
[DOLLAR_SIGN
] != nullptr);
194 U_ASSERT(gUnicodeSets
[POUND_SIGN
] != nullptr);
195 U_ASSERT(gUnicodeSets
[RUPEE_SIGN
] != nullptr);
196 U_ASSERT(gUnicodeSets
[YEN_SIGN
] != nullptr);
197 U_ASSERT(gUnicodeSets
[WON_SIGN
] != nullptr);
199 gUnicodeSets
[DIGITS
] = new UnicodeSet(u
"[:digit:]", status
);
200 if (U_FAILURE(status
)) { return; }
// These two unions depend on DIGITS and on the separator unions built earlier.
201 gUnicodeSets
[DIGITS_OR_ALL_SEPARATORS
] = computeUnion(DIGITS
, ALL_SEPARATORS
);
202 gUnicodeSets
[DIGITS_OR_STRICT_ALL_SEPARATORS
] = computeUnion(DIGITS
, STRICT_ALL_SEPARATORS
);
// Final pass over every non-null slot of the table.
// NOTE(review): the loop body is not visible in this excerpt; presumably each
// set is frozen here - confirm.
204 for (auto* uniset
: gUnicodeSets
) {
205 if (uniset
!= nullptr) {
213 const UnicodeSet
* unisets::get(Key key
) {
214 UErrorCode localStatus
= U_ZERO_ERROR
;
215 umtx_initOnce(gNumberParseUniSetsInitOnce
, &initNumberParseUniSets
, localStatus
);
216 if (U_FAILURE(localStatus
)) {
217 return reinterpret_cast<UnicodeSet
*>(gEmptyUnicodeSet
);
222 Key
unisets::chooseFrom(UnicodeString str
, Key key1
) {
223 return get(key1
)->contains(str
) ? key1
: NONE
;
226 Key
unisets::chooseFrom(UnicodeString str
, Key key1
, Key key2
) {
227 return get(key1
)->contains(str
) ? key1
: chooseFrom(str
, key2
);
230 //Key unisets::chooseCurrency(UnicodeString str) {
231 // if (get(DOLLAR_SIGN)->contains(str)) {
232 // return DOLLAR_SIGN;
233 // } else if (get(POUND_SIGN)->contains(str)) {
234 // return POUND_SIGN;
235 // } else if (get(RUPEE_SIGN)->contains(str)) {
236 // return RUPEE_SIGN;
237 // } else if (get(YEN_SIGN)->contains(str)) {
245 #endif /* #if !UCONFIG_NO_FORMATTING */