]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | // © 2018 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | #include "unicode/utypes.h" | |
5 | ||
6 | #if !UCONFIG_NO_FORMATTING | |
7 | ||
8 | // Allow implicit conversion from char16_t* to UnicodeString for this file: | |
9 | // Helpful in toString methods and elsewhere. | |
10 | #define UNISTR_FROM_STRING_EXPLICIT | |
11 | ||
12 | #include <typeinfo> | |
13 | #include <array> | |
14 | #include "number_types.h" | |
15 | #include "number_patternstring.h" | |
16 | #include "numparse_types.h" | |
17 | #include "numparse_impl.h" | |
18 | #include "numparse_symbols.h" | |
19 | #include "numparse_decimal.h" | |
20 | #include "unicode/numberformatter.h" | |
21 | #include "cstr.h" | |
22 | #include "number_mapper.h" | |
23 | #include "static_unicode_sets.h" | |
24 | ||
25 | using namespace icu; | |
26 | using namespace icu::number; | |
27 | using namespace icu::number::impl; | |
28 | using namespace icu::numparse; | |
29 | using namespace icu::numparse::impl; | |
30 | ||
31 | ||
32 | NumberParseMatcher::~NumberParseMatcher() = default; | |
33 | ||
34 | ||
35 | NumberParserImpl* | |
36 | NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& patternString, | |
37 | parse_flags_t parseFlags, UErrorCode& status) { | |
38 | ||
39 | LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags)); | |
40 | DecimalFormatSymbols symbols(locale, status); | |
41 | ||
42 | parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES}; | |
43 | IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables; | |
44 | ||
45 | DecimalFormatSymbols dfs(locale, status); | |
46 | dfs.setSymbol(DecimalFormatSymbols::kCurrencySymbol, u"IU$"); | |
47 | dfs.setSymbol(DecimalFormatSymbols::kIntlCurrencySymbol, u"ICU"); | |
48 | CurrencySymbols currencySymbols({u"ICU", status}, locale, dfs, status); | |
49 | ||
50 | ParsedPatternInfo patternInfo; | |
51 | PatternParser::parseToPatternInfo(patternString, patternInfo, status); | |
52 | ||
53 | // The following statements set up the affix matchers. | |
54 | AffixTokenMatcherSetupData affixSetupData = { | |
55 | currencySymbols, symbols, ignorables, locale, parseFlags}; | |
56 | parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData}; | |
57 | parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse}; | |
58 | parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers( | |
59 | patternInfo, *parser, ignorables, parseFlags, status); | |
60 | ||
61 | Grouper grouper = Grouper::forStrategy(UNUM_GROUPING_AUTO); | |
62 | grouper.setLocaleData(patternInfo, locale); | |
63 | ||
64 | parser->addMatcher(parser->fLocalMatchers.ignorables); | |
65 | parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags}); | |
66 | parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false}); | |
67 | parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false}); | |
68 | parser->addMatcher(parser->fLocalMatchers.percent = {symbols}); | |
69 | parser->addMatcher(parser->fLocalMatchers.permille = {symbols}); | |
70 | parser->addMatcher(parser->fLocalMatchers.nan = {symbols}); | |
71 | parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); | |
72 | parser->addMatcher(parser->fLocalMatchers.padding = {u"@"}); | |
73 | parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); | |
74 | parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status}); | |
3d1f044b | 75 | parser->addMatcher(parser->fLocalValidators.number = {}); |
0f5d89e8 A |
76 | |
77 | parser->freeze(); | |
78 | return parser.orphan(); | |
79 | } | |
80 | ||
81 | NumberParserImpl* | |
82 | NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatProperties& properties, | |
83 | const DecimalFormatSymbols& symbols, bool parseCurrency, | |
84 | UErrorCode& status) { | |
85 | Locale locale = symbols.getLocale(); | |
86 | PropertiesAffixPatternProvider localPAPP; | |
87 | CurrencyPluralInfoAffixProvider localCPIAP; | |
88 | AffixPatternProvider* affixProvider; | |
89 | if (properties.currencyPluralInfo.fPtr.isNull()) { | |
90 | localPAPP.setTo(properties, status); | |
91 | affixProvider = &localPAPP; | |
92 | } else { | |
93 | localCPIAP.setTo(*properties.currencyPluralInfo.fPtr, properties, status); | |
94 | affixProvider = &localCPIAP; | |
95 | } | |
96 | if (affixProvider == nullptr || U_FAILURE(status)) { return nullptr; } | |
97 | CurrencyUnit currency = resolveCurrency(properties, locale, status); | |
98 | CurrencySymbols currencySymbols(currency, locale, symbols, status); | |
99 | bool isStrict = properties.parseMode.getOrDefault(PARSE_MODE_STRICT) == PARSE_MODE_STRICT; | |
100 | Grouper grouper = Grouper::forProperties(properties); | |
101 | int parseFlags = 0; | |
102 | if (affixProvider == nullptr || U_FAILURE(status)) { return nullptr; } | |
103 | if (!properties.parseCaseSensitive) { | |
104 | parseFlags |= PARSE_FLAG_IGNORE_CASE; | |
105 | } | |
106 | if (properties.parseIntegerOnly) { | |
107 | parseFlags |= PARSE_FLAG_INTEGER_ONLY; | |
108 | } | |
109 | if (properties.signAlwaysShown) { | |
110 | parseFlags |= PARSE_FLAG_PLUS_SIGN_ALLOWED; | |
111 | } | |
112 | if (isStrict) { | |
113 | parseFlags |= PARSE_FLAG_STRICT_GROUPING_SIZE; | |
114 | parseFlags |= PARSE_FLAG_STRICT_SEPARATORS; | |
115 | parseFlags |= PARSE_FLAG_USE_FULL_AFFIXES; | |
116 | parseFlags |= PARSE_FLAG_EXACT_AFFIX; | |
117 | } else { | |
118 | parseFlags |= PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES; | |
119 | } | |
120 | if (grouper.getPrimary() <= 0) { | |
121 | parseFlags |= PARSE_FLAG_GROUPING_DISABLED; | |
122 | } | |
123 | if (parseCurrency || affixProvider->hasCurrencySign()) { | |
124 | parseFlags |= PARSE_FLAG_MONETARY_SEPARATORS; | |
c5116b9f A |
125 | // Apple <rdar://problem/51938595> check for curr symbol in suffix; use affix API instead? |
126 | if (properties.positiveSuffixPattern.indexOf(u'¤') >= 0) { | |
127 | parseFlags |= PARSE_FLAG_HAS_TRAIL_CURRENCY; | |
128 | } | |
0f5d89e8 A |
129 | } |
130 | if (!parseCurrency) { | |
131 | parseFlags |= PARSE_FLAG_NO_FOREIGN_CURRENCY; | |
132 | } | |
133 | ||
134 | LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags)); | |
135 | ||
136 | parser->fLocalMatchers.ignorables = { | |
137 | isStrict ? unisets::STRICT_IGNORABLES : unisets::DEFAULT_IGNORABLES}; | |
138 | IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables; | |
139 | ||
140 | ////////////////////// | |
141 | /// AFFIX MATCHERS /// | |
142 | ////////////////////// | |
143 | ||
144 | // The following statements set up the affix matchers. | |
145 | AffixTokenMatcherSetupData affixSetupData = { | |
146 | currencySymbols, symbols, ignorables, locale, parseFlags}; | |
147 | parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData}; | |
148 | parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse}; | |
149 | parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers( | |
150 | *affixProvider, *parser, ignorables, parseFlags, status); | |
151 | ||
152 | //////////////////////// | |
153 | /// CURRENCY MATCHER /// | |
154 | //////////////////////// | |
155 | ||
156 | if (parseCurrency || affixProvider->hasCurrencySign()) { | |
157 | parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status}); | |
158 | } | |
159 | ||
160 | /////////////// | |
161 | /// PERCENT /// | |
162 | /////////////// | |
163 | ||
164 | // ICU-TC meeting, April 11, 2018: accept percent/permille only if it is in the pattern, | |
165 | // and to maintain regressive behavior, divide by 100 even if no percent sign is present. | |
3d1f044b | 166 | if (!isStrict && affixProvider->containsSymbolType(AffixPatternType::TYPE_PERCENT, status)) { |
0f5d89e8 A |
167 | parser->addMatcher(parser->fLocalMatchers.percent = {symbols}); |
168 | } | |
3d1f044b | 169 | if (!isStrict && affixProvider->containsSymbolType(AffixPatternType::TYPE_PERMILLE, status)) { |
0f5d89e8 A |
170 | parser->addMatcher(parser->fLocalMatchers.permille = {symbols}); |
171 | } | |
172 | ||
173 | /////////////////////////////// | |
174 | /// OTHER STANDARD MATCHERS /// | |
175 | /////////////////////////////// | |
176 | ||
177 | if (!isStrict) { | |
178 | parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false}); | |
179 | parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false}); | |
180 | } | |
181 | parser->addMatcher(parser->fLocalMatchers.nan = {symbols}); | |
182 | parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); | |
183 | UnicodeString padString = properties.padString; | |
184 | if (!padString.isBogus() && !ignorables.getSet()->contains(padString)) { | |
185 | parser->addMatcher(parser->fLocalMatchers.padding = {padString}); | |
186 | } | |
187 | parser->addMatcher(parser->fLocalMatchers.ignorables); | |
188 | parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags}); | |
189 | // NOTE: parseNoExponent doesn't disable scientific parsing if we have a scientific formatter | |
190 | if (!properties.parseNoExponent || properties.minimumExponentDigits > 0) { | |
191 | parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); | |
192 | } | |
193 | ||
194 | ////////////////// | |
195 | /// VALIDATORS /// | |
196 | ////////////////// | |
197 | ||
198 | parser->addMatcher(parser->fLocalValidators.number = {}); | |
199 | if (isStrict) { | |
200 | parser->addMatcher(parser->fLocalValidators.affix = {}); | |
201 | } | |
202 | if (parseCurrency) { | |
203 | parser->addMatcher(parser->fLocalValidators.currency = {}); | |
204 | } | |
205 | if (properties.decimalPatternMatchRequired) { | |
206 | bool patternHasDecimalSeparator = | |
207 | properties.decimalSeparatorAlwaysShown || properties.maximumFractionDigits != 0; | |
208 | parser->addMatcher(parser->fLocalValidators.decimalSeparator = {patternHasDecimalSeparator}); | |
209 | } | |
210 | // The multiplier takes care of scaling percentages. | |
211 | Scale multiplier = scaleFromProperties(properties); | |
212 | if (multiplier.isValid()) { | |
213 | parser->addMatcher(parser->fLocalValidators.multiplier = {multiplier}); | |
214 | } | |
215 | ||
216 | parser->freeze(); | |
217 | return parser.orphan(); | |
218 | } | |
219 | ||
220 | NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags) | |
221 | : fParseFlags(parseFlags) { | |
222 | } | |
223 | ||
224 | NumberParserImpl::~NumberParserImpl() { | |
225 | fNumMatchers = 0; | |
226 | } | |
227 | ||
228 | void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) { | |
229 | if (fNumMatchers + 1 > fMatchers.getCapacity()) { | |
230 | fMatchers.resize(fNumMatchers * 2, fNumMatchers); | |
231 | } | |
232 | fMatchers[fNumMatchers] = &matcher; | |
233 | fNumMatchers++; | |
234 | } | |
235 | ||
236 | void NumberParserImpl::freeze() { | |
237 | fFrozen = true; | |
238 | } | |
239 | ||
240 | parse_flags_t NumberParserImpl::getParseFlags() const { | |
241 | return fParseFlags; | |
242 | } | |
243 | ||
244 | void NumberParserImpl::parse(const UnicodeString& input, bool greedy, ParsedNumber& result, | |
245 | UErrorCode& status) const { | |
246 | return parse(input, 0, greedy, result, status); | |
247 | } | |
248 | ||
249 | void NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result, | |
250 | UErrorCode& status) const { | |
251 | if (U_FAILURE(status)) { | |
252 | return; | |
253 | } | |
254 | U_ASSERT(fFrozen); | |
255 | // TODO: Check start >= 0 and start < input.length() | |
256 | StringSegment segment(input, 0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)); | |
257 | segment.adjustOffset(start); | |
258 | if (greedy) { | |
3d1f044b A |
259 | parseGreedy(segment, result, status); |
260 | } else if (0 != (fParseFlags & PARSE_FLAG_ALLOW_INFINITE_RECURSION)) { | |
261 | // Start at 1 so that recursionLevels never gets to 0 | |
262 | parseLongestRecursive(segment, result, 1, status); | |
0f5d89e8 | 263 | } else { |
3d1f044b A |
264 | // Arbitrary recursion safety limit: 100 levels. |
265 | parseLongestRecursive(segment, result, -100, status); | |
0f5d89e8 A |
266 | } |
267 | for (int32_t i = 0; i < fNumMatchers; i++) { | |
268 | fMatchers[i]->postProcess(result); | |
269 | } | |
270 | result.postProcess(); | |
271 | } | |
272 | ||
3d1f044b | 273 | void NumberParserImpl::parseGreedy(StringSegment& segment, ParsedNumber& result, |
0f5d89e8 | 274 | UErrorCode& status) const { |
3d1f044b | 275 | // Note: this method is not recursive in order to avoid stack overflow. |
c5116b9f | 276 | bool extraLoop = FALSE; |
3d1f044b A |
277 | for (int i = 0; i <fNumMatchers;) { |
278 | // Base Case | |
279 | if (segment.length() == 0) { | |
c5116b9f A |
280 | if ((extraLoop && i==0) || (fParseFlags & PARSE_FLAG_HAS_TRAIL_CURRENCY) == 0 || result.currencyCode[0] != 0) { // Apple <rdar://problem/51938595> |
281 | return; | |
282 | } | |
283 | // If we are parsing for currency expected at the end but have not found it yet, | |
284 | // allow one more loop to see if we are matching an empty currency symbol | |
285 | if (extraLoop) { | |
286 | extraLoop = TRUE; // Apple <rdar://problem/51938595> | |
287 | } | |
3d1f044b | 288 | } |
0f5d89e8 A |
289 | const NumberParseMatcher* matcher = fMatchers[i]; |
290 | if (!matcher->smokeTest(segment)) { | |
3d1f044b A |
291 | // Matcher failed smoke test: try the next one |
292 | i++; | |
0f5d89e8 A |
293 | continue; |
294 | } | |
3d1f044b | 295 | int32_t initialOffset = segment.getOffset(); |
0f5d89e8 A |
296 | matcher->match(segment, result, status); |
297 | if (U_FAILURE(status)) { | |
298 | return; | |
299 | } | |
c5116b9f A |
300 | if (segment.getOffset() != initialOffset || |
301 | (extraLoop && result.currencyCode[0] != 0)) { // Apple <rdar://problem/51938595> | |
3d1f044b A |
302 | // Greedy heuristic: accept the match and loop back |
303 | i = 0; | |
304 | continue; | |
305 | } else { | |
306 | // Matcher did not match: try the next one | |
307 | i++; | |
308 | continue; | |
0f5d89e8 | 309 | } |
3d1f044b | 310 | UPRV_UNREACHABLE; |
0f5d89e8 A |
311 | } |
312 | ||
313 | // NOTE: If we get here, the greedy parse completed without consuming the entire string. | |
314 | } | |
315 | ||
316 | void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumber& result, | |
3d1f044b | 317 | int32_t recursionLevels, |
0f5d89e8 A |
318 | UErrorCode& status) const { |
319 | // Base Case | |
320 | if (segment.length() == 0) { | |
321 | return; | |
322 | } | |
323 | ||
3d1f044b A |
324 | // Safety against stack overflow |
325 | if (recursionLevels == 0) { | |
326 | return; | |
327 | } | |
328 | ||
0f5d89e8 A |
329 | // TODO: Give a nice way for the matcher to reset the ParsedNumber? |
330 | ParsedNumber initial(result); | |
331 | ParsedNumber candidate; | |
332 | ||
333 | int initialOffset = segment.getOffset(); | |
334 | for (int32_t i = 0; i < fNumMatchers; i++) { | |
335 | const NumberParseMatcher* matcher = fMatchers[i]; | |
336 | if (!matcher->smokeTest(segment)) { | |
337 | continue; | |
338 | } | |
339 | ||
340 | // In a non-greedy parse, we attempt all possible matches and pick the best. | |
341 | for (int32_t charsToConsume = 0; charsToConsume < segment.length();) { | |
342 | charsToConsume += U16_LENGTH(segment.codePointAt(charsToConsume)); | |
343 | ||
344 | // Run the matcher on a segment of the current length. | |
345 | candidate = initial; | |
346 | segment.setLength(charsToConsume); | |
347 | bool maybeMore = matcher->match(segment, candidate, status); | |
348 | segment.resetLength(); | |
349 | if (U_FAILURE(status)) { | |
350 | return; | |
351 | } | |
352 | ||
353 | // If the entire segment was consumed, recurse. | |
354 | if (segment.getOffset() - initialOffset == charsToConsume) { | |
3d1f044b | 355 | parseLongestRecursive(segment, candidate, recursionLevels + 1, status); |
0f5d89e8 A |
356 | if (U_FAILURE(status)) { |
357 | return; | |
358 | } | |
359 | if (candidate.isBetterThan(result)) { | |
360 | result = candidate; | |
361 | } | |
362 | } | |
363 | ||
364 | // Since the segment can be re-used, reset the offset. | |
365 | // This does not have an effect if the matcher did not consume any chars. | |
366 | segment.setOffset(initialOffset); | |
367 | ||
368 | // Unless the matcher wants to see the next char, continue to the next matcher. | |
369 | if (!maybeMore) { | |
370 | break; | |
371 | } | |
372 | } | |
373 | } | |
374 | } | |
375 | ||
376 | UnicodeString NumberParserImpl::toString() const { | |
377 | UnicodeString result(u"<NumberParserImpl matchers:["); | |
378 | for (int32_t i = 0; i < fNumMatchers; i++) { | |
379 | result.append(u' '); | |
380 | result.append(fMatchers[i]->toString()); | |
381 | } | |
382 | result.append(u" ]>", -1); | |
383 | return result; | |
384 | } | |
385 | ||
386 | ||
387 | #endif /* #if !UCONFIG_NO_FORMATTING */ |