icuSources/common/static_unicode_sets.cpp

   1 // © 2018 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3
   4 #include "unicode/utypes.h"
   5
   6 #if !UCONFIG_NO_FORMATTING
   7
   8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
   9 // Helpful in toString methods and elsewhere.
  10 #define UNISTR_FROM_STRING_EXPLICIT
  11
  12 #include "static_unicode_sets.h"
  13 #include "umutex.h"
  14 #include "ucln_cmn.h"
  15 #include "unicode/uniset.h"
  16 #include "uresimp.h"
  17 #include "cstring.h"
  18 #include "uassert.h"
  19
  20 using namespace icu;
  21 using namespace icu::unisets;
  22
  23
  24 namespace {
  25
  26 UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};
  27
  28 // Save the empty instance in static memory to have well-defined behavior if a
  29 // regular UnicodeSet cannot be allocated.
  30 alignas(UnicodeSet)
  31 char gEmptyUnicodeSet[sizeof(UnicodeSet)];
  32
  33 // Whether the gEmptyUnicodeSet is initialized and ready to use.
  34 UBool gEmptyUnicodeSetInitialized = FALSE;
  35
  36 inline UnicodeSet* getImpl(Key key) {
  37     UnicodeSet* candidate = gUnicodeSets[key];
  38     if (candidate == nullptr) {
  39         return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
  40     }
  41     return candidate;
  42 }
  43
  44 UnicodeSet* computeUnion(Key k1, Key k2) {
  45     UnicodeSet* result = new UnicodeSet();
  46     if (result == nullptr) {
  47         return nullptr;
  48     }
  49     result->addAll(*getImpl(k1));
  50     result->addAll(*getImpl(k2));
  51     result->freeze();
  52     return result;
  53 }
  54
  55 UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
  56     UnicodeSet* result = new UnicodeSet();
  57     if (result == nullptr) {
  58         return nullptr;
  59     }
  60     result->addAll(*getImpl(k1));
  61     result->addAll(*getImpl(k2));
  62     result->addAll(*getImpl(k3));
  63     result->freeze();
  64     return result;
  65 }
  66
  67
  68 void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
  69     // assert unicodeSets.get(key) == null;
  70     gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
  71 }
  72
  73 class ParseDataSink : public ResourceSink {
  74   public:
  75     void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE {
  76         ResourceTable contextsTable = value.getTable(status);
  77         if (U_FAILURE(status)) { return; }
  78         for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
  79             if (uprv_strcmp(key, "date") == 0) {
  80                 // ignore
  81             } else {
  82                 ResourceTable strictnessTable = value.getTable(status);
  83                 if (U_FAILURE(status)) { return; }
  84                 for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
  85                     bool isLenient = (uprv_strcmp(key, "lenient") == 0);
  86                     ResourceArray array = value.getArray(status);
  87                     if (U_FAILURE(status)) { return; }
  88                     for (int k = 0; k < array.getSize(); k++) {
  89                         array.getValue(k, value);
  90                         UnicodeString str = value.getUnicodeString(status);
  91                         if (U_FAILURE(status)) { return; }
  92                         // There is both lenient and strict data for comma/period,
  93                         // but not for any of the other symbols.
  94                         if (str.indexOf(u'.') != -1) {
  95                             saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
  96                         } else if (str.indexOf(u',') != -1) {
  97                             saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
  98                         } else if (str.indexOf(u'+') != -1) {
  99                             saveSet(PLUS_SIGN, str, status);
 100                         } else if (str.indexOf(u'-') != -1) {
 101                             saveSet(MINUS_SIGN, str, status);
 102                         } else if (str.indexOf(u'$') != -1) {
 103                             saveSet(DOLLAR_SIGN, str, status);
 104                         } else if (str.indexOf(u'£') != -1) {
 105                             saveSet(POUND_SIGN, str, status);
 106                         } else if (str.indexOf(u'₹') != -1) {
 107                             saveSet(RUPEE_SIGN, str, status);
 108                         } else if (str.indexOf(u'¥') != -1) {
 109                             saveSet(YEN_SIGN, str, status);
 110                         } else if (str.indexOf(u'₩') != -1) {
 111                             saveSet(WON_SIGN, str, status);
 112                         } else if (str.indexOf(u'%') != -1) {
 113                             saveSet(PERCENT_SIGN, str, status);
 114                         } else if (str.indexOf(u'‰') != -1) {
 115                             saveSet(PERMILLE_SIGN, str, status);
 116                         } else if (str.indexOf(u'’') != -1) {
 117                             saveSet(APOSTROPHE_SIGN, str, status);
 118                         } else {
 119                             // Unknown class of parse lenients
 120                             // TODO(ICU-20428): Make ICU automatically accept new classes?
 121                             U_ASSERT(FALSE);
 122                         }
 123                         if (U_FAILURE(status)) { return; }
 124                     }
 125                 }
 126             }
 127         }
 128     }
 129 };
 130
 131
 132 icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
 133
 134 UBool U_CALLCONV cleanupNumberParseUniSets() {
 135     if (gEmptyUnicodeSetInitialized) {
 136         reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
 137         gEmptyUnicodeSetInitialized = FALSE;
 138     }
 139     for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
 140         delete gUnicodeSets[i];
 141         gUnicodeSets[i] = nullptr;
 142     }
 143     gNumberParseUniSetsInitOnce.reset();
 144     return TRUE;
 145 }
 146
 147 void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
 148     ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
 149
 150     // Initialize the empty instance for well-defined fallback behavior
 151     new(gEmptyUnicodeSet) UnicodeSet();
 152     reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
 153     gEmptyUnicodeSetInitialized = TRUE;
 154
 155     // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
 156     // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
 157     gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
 158             u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
 159     gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
 160
 161     LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
 162     if (U_FAILURE(status)) { return; }
 163     ParseDataSink sink;
 164     ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
 165     if (U_FAILURE(status)) { return; }
 166
 167     // NOTE: It is OK for these assertions to fail if there was a no-data build.
 168     U_ASSERT(gUnicodeSets[COMMA] != nullptr);
 169     U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
 170     U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
 171     U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
 172     U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);
 173
 174     LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
 175         u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
 176         status
 177     ), status);
 178     if (U_FAILURE(status)) { return; }
 179     otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);
 180     gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
 181     gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
 182     gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
 183             STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
 184
 185     U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
 186     U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
 187     U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
 188     U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);
 189
 190     gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
 191     if (U_FAILURE(status)) { return; }
 192
 193     U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
 194     U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
 195     U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
 196     U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
 197     U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);
 198
 199     gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
 200     if (U_FAILURE(status)) { return; }
 201     gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
 202     gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
 203
 204     for (auto* uniset : gUnicodeSets) {
 205         if (uniset != nullptr) {
 206             uniset->freeze();
 207         }
 208     }
 209 }
 210
 211 }
 212
 213 const UnicodeSet* unisets::get(Key key) {
 214     UErrorCode localStatus = U_ZERO_ERROR;
 215     umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
 216     if (U_FAILURE(localStatus)) {
 217         return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
 218     }
 219     return getImpl(key);
 220 }
 221
 222 Key unisets::chooseFrom(UnicodeString str, Key key1) {
 223     return get(key1)->contains(str) ? key1 : NONE;
 224 }
 225
 226 Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
 227     return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
 228 }
 229
 230 //Key unisets::chooseCurrency(UnicodeString str) {
 231 //    if (get(DOLLAR_SIGN)->contains(str)) {
 232 //        return DOLLAR_SIGN;
 233 //    } else if (get(POUND_SIGN)->contains(str)) {
 234 //        return POUND_SIGN;
 235 //    } else if (get(RUPEE_SIGN)->contains(str)) {
 236 //        return RUPEE_SIGN;
 237 //    } else if (get(YEN_SIGN)->contains(str)) {
 238 //        return YEN_SIGN;
 239 //    } else {
 240 //        return NONE;
 241 //    }
 242 //}
 243
 244
 245 #endif /* #if !UCONFIG_NO_FORMATTING */