]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/static_unicode_sets.cpp
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / common / static_unicode_sets.cpp
CommitLineData
0f5d89e8
A
1// © 2018 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4#include "unicode/utypes.h"
5
6#if !UCONFIG_NO_FORMATTING
7
8// Allow implicit conversion from char16_t* to UnicodeString for this file:
9// Helpful in toString methods and elsewhere.
10#define UNISTR_FROM_STRING_EXPLICIT
11
12#include "static_unicode_sets.h"
13#include "umutex.h"
14#include "ucln_cmn.h"
15#include "unicode/uniset.h"
16#include "uresimp.h"
17#include "cstring.h"
18#include "uassert.h"
19
20using namespace icu;
21using namespace icu::unisets;
22
23
24namespace {
25
26UnicodeSet* gUnicodeSets[COUNT] = {};
27
28// Save the empty instance in static memory to have well-defined behavior if a
29// regular UnicodeSet cannot be allocated.
30char gEmptyUnicodeSet[sizeof(UnicodeSet)];
31
32// Whether the gEmptyUnicodeSet is initialized and ready to use.
33UBool gEmptyUnicodeSetInitialized = FALSE;
34
35inline UnicodeSet* getImpl(Key key) {
36 UnicodeSet* candidate = gUnicodeSets[key];
37 if (candidate == nullptr) {
38 return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
39 }
40 return candidate;
41}
42
43UnicodeSet* computeUnion(Key k1, Key k2) {
44 UnicodeSet* result = new UnicodeSet();
45 if (result == nullptr) {
46 return nullptr;
47 }
48 result->addAll(*getImpl(k1));
49 result->addAll(*getImpl(k2));
50 result->freeze();
51 return result;
52}
53
54UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
55 UnicodeSet* result = new UnicodeSet();
56 if (result == nullptr) {
57 return nullptr;
58 }
59 result->addAll(*getImpl(k1));
60 result->addAll(*getImpl(k2));
61 result->addAll(*getImpl(k3));
62 result->freeze();
63 return result;
64}
65
66
67void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
68 // assert unicodeSets.get(key) == null;
69 gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
70}
71
72class ParseDataSink : public ResourceSink {
73 public:
74 void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE {
75 ResourceTable contextsTable = value.getTable(status);
76 if (U_FAILURE(status)) { return; }
77 for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
78 if (uprv_strcmp(key, "date") == 0) {
79 // ignore
80 } else {
81 ResourceTable strictnessTable = value.getTable(status);
82 if (U_FAILURE(status)) { return; }
83 for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
84 bool isLenient = (uprv_strcmp(key, "lenient") == 0);
85 ResourceArray array = value.getArray(status);
86 if (U_FAILURE(status)) { return; }
87 for (int k = 0; k < array.getSize(); k++) {
88 array.getValue(k, value);
89 UnicodeString str = value.getUnicodeString(status);
90 if (U_FAILURE(status)) { return; }
91 // There is both lenient and strict data for comma/period,
92 // but not for any of the other symbols.
93 if (str.indexOf(u'.') != -1) {
94 saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
95 } else if (str.indexOf(u',') != -1) {
96 saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
97 } else if (str.indexOf(u'+') != -1) {
98 saveSet(PLUS_SIGN, str, status);
99 } else if (str.indexOf(u'‒') != -1) {
100 saveSet(MINUS_SIGN, str, status);
101 } else if (str.indexOf(u'$') != -1) {
102 saveSet(DOLLAR_SIGN, str, status);
103 } else if (str.indexOf(u'£') != -1) {
104 saveSet(POUND_SIGN, str, status);
105 } else if (str.indexOf(u'₨') != -1) {
106 saveSet(RUPEE_SIGN, str, status);
107 }
108 if (U_FAILURE(status)) { return; }
109 }
110 }
111 }
112 }
113 }
114};
115
116
117icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
118
119UBool U_CALLCONV cleanupNumberParseUniSets() {
120 if (gEmptyUnicodeSetInitialized) {
121 reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
122 gEmptyUnicodeSetInitialized = FALSE;
123 }
124 for (int32_t i = 0; i < COUNT; i++) {
125 delete gUnicodeSets[i];
126 gUnicodeSets[i] = nullptr;
127 }
128 gNumberParseUniSetsInitOnce.reset();
129 return TRUE;
130}
131
132void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
133 ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
134
135 // Initialize the empty instance for well-defined fallback behavior
136 new(gEmptyUnicodeSet) UnicodeSet();
137 reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
138 gEmptyUnicodeSetInitialized = TRUE;
139
140 // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
141 // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
142 gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
143 u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
144 gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
145
146 LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
147 if (U_FAILURE(status)) { return; }
148 ParseDataSink sink;
149 ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
150 if (U_FAILURE(status)) { return; }
151
152 // NOTE: It is OK for these assertions to fail if there was a no-data build.
153 U_ASSERT(gUnicodeSets[COMMA] != nullptr);
154 U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
155 U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
156 U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
157
158 gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet(
159 u"['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
160 gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
161 gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
162 STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
163
164 U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
165 U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
166
167 gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status);
168 gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status);
169 gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
170
171 U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
172 U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
173 U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
174 gUnicodeSets[YEN_SIGN] = new UnicodeSet(u"[¥\\uffe5]", status);
175
176 gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
177
178 gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
179 gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
180
181 for (auto* uniset : gUnicodeSets) {
182 if (uniset != nullptr) {
183 uniset->freeze();
184 }
185 }
186}
187
188}
189
190const UnicodeSet* unisets::get(Key key) {
191 UErrorCode localStatus = U_ZERO_ERROR;
192 umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
193 if (U_FAILURE(localStatus)) {
194 return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
195 }
196 return getImpl(key);
197}
198
199Key unisets::chooseFrom(UnicodeString str, Key key1) {
200 return get(key1)->contains(str) ? key1 : NONE;
201}
202
203Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
204 return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
205}
206
207//Key unisets::chooseCurrency(UnicodeString str) {
208// if (get(DOLLAR_SIGN)->contains(str)) {
209// return DOLLAR_SIGN;
210// } else if (get(POUND_SIGN)->contains(str)) {
211// return POUND_SIGN;
212// } else if (get(RUPEE_SIGN)->contains(str)) {
213// return RUPEE_SIGN;
214// } else if (get(YEN_SIGN)->contains(str)) {
215// return YEN_SIGN;
216// } else {
217// return NONE;
218// }
219//}
220
221
222#endif /* #if !UCONFIG_NO_FORMATTING */