[apple/icu.git] / icuSources / common / static_unicode_sets.cpp

// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT

#include "static_unicode_sets.h"
#include "umutex.h"
#include "ucln_cmn.h"
#include "unicode/uniset.h"
#include "uresimp.h"
#include "cstring.h"
#include "uassert.h"

using namespace icu;
using namespace icu::unisets;


namespace {

UnicodeSet* gUnicodeSets[COUNT] = {};

// Save the empty instance in static memory to have well-defined behavior if a
// regular UnicodeSet cannot be allocated.
char gEmptyUnicodeSet[sizeof(UnicodeSet)];

// Whether the gEmptyUnicodeSet is initialized and ready to use.
UBool gEmptyUnicodeSetInitialized = FALSE;

// Looks up the set stored for `key` without triggering initialization.
// A null slot is mapped to the static empty set so that callers always
// receive a pointer they can dereference.
inline UnicodeSet* getImpl(Key key) {
    UnicodeSet* const stored = gUnicodeSets[key];
    return (stored != nullptr) ? stored : reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
}

// Builds a new frozen set holding everything in the sets stored for k1 and k2.
// Slots that are still null contribute the empty set (see getImpl).
// Returns nullptr when the result set could not be allocated; otherwise the
// caller takes ownership of the returned object.
UnicodeSet* computeUnion(Key k1, Key k2) {
    UnicodeSet* merged = new UnicodeSet();
    if (merged != nullptr) {
        merged->addAll(*getImpl(k1)).addAll(*getImpl(k2)).freeze();
    }
    return merged;
}

// Three-way variant: builds a new frozen set holding everything in the sets
// stored for k1, k2 and k3. Slots that are still null contribute the empty
// set (see getImpl).
// Returns nullptr when the result set could not be allocated; otherwise the
// caller takes ownership of the returned object.
UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
    UnicodeSet* merged = new UnicodeSet();
    if (merged != nullptr) {
        merged->addAll(*getImpl(k1)).addAll(*getImpl(k2)).addAll(*getImpl(k3)).freeze();
    }
    return merged;
}


// Parses `unicodeSetPattern` into a newly allocated UnicodeSet and stores it in
// the gUnicodeSets slot for `key`.
//
// key               - slot to fill.
// unicodeSetPattern - pattern handed to the UnicodeSet constructor.
// status            - passed through to the UnicodeSet constructor, which
//                     reports pattern errors through it.
void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
    // assert unicodeSets.get(key) == null;
    // The slot is expected to be empty, but the patterns come from resource
    // data. If two patterns ever map to the same key, release the earlier set
    // instead of leaking it. (delete on a null pointer is a no-op.)
    UnicodeSet* replaced = gUnicodeSets[key];
    gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
    delete replaced;
}

class ParseDataSink : public ResourceSink {
  public:
    void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE {
        ResourceTable contextsTable = value.getTable(status);
        if (U_FAILURE(status)) { return; }
        for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
            if (uprv_strcmp(key, "date") == 0) {
                // ignore
            } else {
                ResourceTable strictnessTable = value.getTable(status);
                if (U_FAILURE(status)) { return; }
                for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
                    bool isLenient = (uprv_strcmp(key, "lenient") == 0);
                    ResourceArray array = value.getArray(status);
                    if (U_FAILURE(status)) { return; }
                    for (int k = 0; k < array.getSize(); k++) {
                        array.getValue(k, value);
                        UnicodeString str = value.getUnicodeString(status);
                        if (U_FAILURE(status)) { return; }
                        // There is both lenient and strict data for comma/period,
                        // but not for any of the other symbols.
                        if (str.indexOf(u'.') != -1) {
                            saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
                        } else if (str.indexOf(u',') != -1) {
                            saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
                        } else if (str.indexOf(u'+') != -1) {
                            saveSet(PLUS_SIGN, str, status);
                        } else if (str.indexOf(u'‒') != -1) {
                            saveSet(MINUS_SIGN, str, status);
                        } else if (str.indexOf(u'$') != -1) {
                            saveSet(DOLLAR_SIGN, str, status);
                        } else if (str.indexOf(u'£') != -1) {
                            saveSet(POUND_SIGN, str, status);
                        } else if (str.indexOf(u'₨') != -1) {
                            saveSet(RUPEE_SIGN, str, status);
                        }
                        if (U_FAILURE(status)) { return; }
                    }
                }
            }
        }
    }
};


// Init-once state passed to umtx_initOnce() in unisets::get() to run
// initNumberParseUniSets(); reset by cleanupNumberParseUniSets().
icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;

// Cleanup callback registered by initNumberParseUniSets().
// Destroys the in-place empty set (if it was constructed), deletes every
// stored set, and resets the init-once state. Always returns TRUE.
UBool U_CALLCONV cleanupNumberParseUniSets() {
    if (gEmptyUnicodeSetInitialized) {
        // Constructed with placement-new, so run the destructor explicitly
        // instead of calling delete.
        UnicodeSet* emptySet = reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
        emptySet->~UnicodeSet();
        gEmptyUnicodeSetInitialized = FALSE;
    }
    for (UnicodeSet*& slot : gUnicodeSets) {
        delete slot;
        slot = nullptr;
    }
    gNumberParseUniSetsInitOnce.reset();
    return TRUE;
}

// Initializer for the static sets; invoked through umtx_initOnce() from
// unisets::get().
//
// Steps, in order:
//   1. register cleanupNumberParseUniSets() as the cleanup callback;
//   2. construct and freeze the static empty fallback set;
//   3. create the hard-coded ignorable sets;
//   4. feed the "parse" data of the "root" resource bundle to ParseDataSink,
//      which fills further slots through saveSet();
//   5. create the remaining hard-coded sets and the derived union sets;
//   6. freeze every non-null slot.
//
// status - receives errors from set construction and resource loading. If
//          opening the bundle or enumerating "parse" fails, the function
//          returns early and the later slots stay null.
void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
    ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);

    // Initialize the empty instance for well-defined fallback behavior
    new(gEmptyUnicodeSet) UnicodeSet();
    reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
    gEmptyUnicodeSetInitialized = TRUE;

    // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
    gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
            u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
    gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);

    // Load the data-driven sets; ParseDataSink stores them via saveSet().
    LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
    if (U_FAILURE(status)) { return; }
    ParseDataSink sink;
    ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
    if (U_FAILURE(status)) { return; }

    // NOTE: It is OK for these assertions to fail if there was a no-data build.
    U_ASSERT(gUnicodeSets[COMMA] != nullptr);
    U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
    U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
    U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);

    // The union sets below read slots filled above, so the order of these
    // statements matters. computeUnion() substitutes the empty set for any
    // slot that is still null.
    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet(
            u"['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);

    U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);

    gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status);
    gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status);
    gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);

    U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
    gUnicodeSets[YEN_SIGN] = new UnicodeSet(u"[¥\\uffe5]", status);

    gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);

    // These depend on DIGITS and on the ALL_SEPARATORS unions computed above.
    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);

    // Freeze every set that was created; slots left null are skipped.
    // (The computeUnion() results were already frozen there.)
    for (auto* uniset : gUnicodeSets) {
        if (uniset != nullptr) {
            uniset->freeze();
        }
    }
}

}

// Returns the shared set for `key`, running initNumberParseUniSets() through
// umtx_initOnce() first. If initialization reports a failure, the static empty
// set is returned instead; a null slot also yields the empty set (see getImpl).
const UnicodeSet* unisets::get(Key key) {
    UErrorCode initStatus = U_ZERO_ERROR;
    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, initStatus);
    return U_SUCCESS(initStatus)
            ? getImpl(key)
            : reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
}

// Returns key1 if the set for key1 contains `str`, otherwise NONE.
Key unisets::chooseFrom(UnicodeString str, Key key1) {
    if (get(key1)->contains(str)) {
        return key1;
    }
    return NONE;
}

// Returns the first of key1, key2 whose set contains `str`, or NONE if
// neither does.
Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
    if (get(key1)->contains(str)) {
        return key1;
    }
    return chooseFrom(str, key2);
}

//Key unisets::chooseCurrency(UnicodeString str) {
//    if (get(DOLLAR_SIGN)->contains(str)) {
//        return DOLLAR_SIGN;
//    } else if (get(POUND_SIGN)->contains(str)) {
//        return POUND_SIGN;
//    } else if (get(RUPEE_SIGN)->contains(str)) {
//        return RUPEE_SIGN;
//    } else if (get(YEN_SIGN)->contains(str)) {
//        return YEN_SIGN;
//    } else {
//        return NONE;
//    }
//}


#endif /* #if !UCONFIG_NO_FORMATTING */
Commit	Line	Data
0f5d89e8 A	1	// © 2018 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3
	4	#include "unicode/utypes.h"
	5
	6	#if !UCONFIG_NO_FORMATTING
	7
	8	// Allow implicit conversion from char16_t* to UnicodeString for this file:
	9	// Helpful in toString methods and elsewhere.
	10	#define UNISTR_FROM_STRING_EXPLICIT
	11
	12	#include "static_unicode_sets.h"
	13	#include "umutex.h"
	14	#include "ucln_cmn.h"
	15	#include "unicode/uniset.h"
	16	#include "uresimp.h"
	17	#include "cstring.h"
	18	#include "uassert.h"
	19
	20	using namespace icu;
	21	using namespace icu::unisets;
	22
	23
	24	namespace {
	25
	26	UnicodeSet* gUnicodeSets[COUNT] = {};
	27
	28	// Save the empty instance in static memory to have well-defined behavior if a
	29	// regular UnicodeSet cannot be allocated.
	30	char gEmptyUnicodeSet[sizeof(UnicodeSet)];
	31
	32	// Whether the gEmptyUnicodeSet is initialized and ready to use.
	33	UBool gEmptyUnicodeSetInitialized = FALSE;
	34
	35	inline UnicodeSet* getImpl(Key key) {
	36	UnicodeSet* candidate = gUnicodeSets[key];
	37	if (candidate == nullptr) {
	38	return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
	39	}
	40	return candidate;
	41	}
	42
	43	UnicodeSet* computeUnion(Key k1, Key k2) {
	44	UnicodeSet* result = new UnicodeSet();
	45	if (result == nullptr) {
	46	return nullptr;
	47	}
	48	result->addAll(*getImpl(k1));
	49	result->addAll(*getImpl(k2));
	50	result->freeze();
	51	return result;
	52	}
	53
	54	UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
	55	UnicodeSet* result = new UnicodeSet();
	56	if (result == nullptr) {
	57	return nullptr;
	58	}
	59	result->addAll(*getImpl(k1));
	60	result->addAll(*getImpl(k2));
	61	result->addAll(*getImpl(k3));
	62	result->freeze();
	63	return result;
	64	}
65
66
67	void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
68	// assert unicodeSets.get(key) == null;
69	gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
70	}
71
72	class ParseDataSink : public ResourceSink {
73	public:
74	void put(const char* key, ResourceValue& value, UBool /noFallback/, UErrorCode& status) U_OVERRIDE {
75	ResourceTable contextsTable = value.getTable(status);
76	if (U_FAILURE(status)) { return; }
77	for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
78	if (uprv_strcmp(key, "date") == 0) {
79	// ignore
80	} else {
81	ResourceTable strictnessTable = value.getTable(status);
82	if (U_FAILURE(status)) { return; }
83	for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
84	bool isLenient = (uprv_strcmp(key, "lenient") == 0);
85	ResourceArray array = value.getArray(status);
86	if (U_FAILURE(status)) { return; }
87	for (int k = 0; k < array.getSize(); k++) {
88	array.getValue(k, value);
89	UnicodeString str = value.getUnicodeString(status);
90	if (U_FAILURE(status)) { return; }
91	// There is both lenient and strict data for comma/period,
92	// but not for any of the other symbols.
93	if (str.indexOf(u'.') != -1) {
94	saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
95	} else if (str.indexOf(u',') != -1) {
96	saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
97	} else if (str.indexOf(u'+') != -1) {
98	saveSet(PLUS_SIGN, str, status);
99	} else if (str.indexOf(u'‒') != -1) {
100	saveSet(MINUS_SIGN, str, status);
101	} else if (str.indexOf(u'$') != -1) {
102	saveSet(DOLLAR_SIGN, str, status);
103	} else if (str.indexOf(u'£') != -1) {
104	saveSet(POUND_SIGN, str, status);
105	} else if (str.indexOf(u'₨') != -1) {
106	saveSet(RUPEE_SIGN, str, status);
107	}
108	if (U_FAILURE(status)) { return; }
109	}
110	}
111	}
112	}
113	}
114	};
115
116
117	icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
118
119	UBool U_CALLCONV cleanupNumberParseUniSets() {
120	if (gEmptyUnicodeSetInitialized) {
121	reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
122	gEmptyUnicodeSetInitialized = FALSE;
123	}
124	for (int32_t i = 0; i < COUNT; i++) {
125	delete gUnicodeSets[i];
126	gUnicodeSets[i] = nullptr;
127	}
128	gNumberParseUniSetsInitOnce.reset();
129	return TRUE;
130	}
131
132	void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
133	ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
134
135	// Initialize the empty instance for well-defined fallback behavior
136	new(gEmptyUnicodeSet) UnicodeSet();
137	reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
138	gEmptyUnicodeSetInitialized = TRUE;
139
140	// These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
141	// Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
142	gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
143	u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
144	gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
145
146	LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
147	if (U_FAILURE(status)) { return; }
148	ParseDataSink sink;
149	ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
150	if (U_FAILURE(status)) { return; }
151
152	// NOTE: It is OK for these assertions to fail if there was a no-data build.
153	U_ASSERT(gUnicodeSets[COMMA] != nullptr);
154	U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
155	U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
156	U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
157
158	gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet(
159	u"['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
160	gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
161	gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
162	STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
163
164	U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
165	U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
166
167	gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status);
168	gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status);
169	gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
170
171	U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
172	U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
173	U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
174	gUnicodeSets[YEN_SIGN] = new UnicodeSet(u"[¥\\uffe5]", status);
175
176	gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
177
178	gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
179	gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
180
181	for (auto* uniset : gUnicodeSets) {
182	if (uniset != nullptr) {
183	uniset->freeze();
184	}
185	}
186	}
187
188	}
189
190	const UnicodeSet* unisets::get(Key key) {
191	UErrorCode localStatus = U_ZERO_ERROR;
192	umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
193	if (U_FAILURE(localStatus)) {
194	return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
195	}
196	return getImpl(key);
197	}
198
199	Key unisets::chooseFrom(UnicodeString str, Key key1) {
200	return get(key1)->contains(str) ? key1 : NONE;
201	}
202
203	Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
204	return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
205	}
206
207	//Key unisets::chooseCurrency(UnicodeString str) {
208	// if (get(DOLLAR_SIGN)->contains(str)) {
209	// return DOLLAR_SIGN;
210	// } else if (get(POUND_SIGN)->contains(str)) {
211	// return POUND_SIGN;
212	// } else if (get(RUPEE_SIGN)->contains(str)) {
213	// return RUPEE_SIGN;
214	// } else if (get(YEN_SIGN)->contains(str)) {
215	// return YEN_SIGN;
216	// } else {
217	// return NONE;
218	// }
219	//}
220
221
222	#endif /* #if !UCONFIG_NO_FORMATTING */