]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/numparse_decimal.cpp
ICU-62123.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / numparse_decimal.cpp
1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include "unicode/utypes.h"
5
6 #if !UCONFIG_NO_FORMATTING
7
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11
12 #include "numparse_types.h"
13 #include "numparse_decimal.h"
14 #include "static_unicode_sets.h"
15 #include "numparse_utils.h"
16 #include "unicode/uchar.h"
17 #include "putilimp.h"
18 #include "number_decimalquantity.h"
19
20 using namespace icu;
21 using namespace icu::numparse;
22 using namespace icu::numparse::impl;
23
24
25 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
26 parse_flags_t parseFlags) {
27 if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
28 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
29 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
30 } else {
31 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
32 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
33 }
34 bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
35 unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
36 : unisets::ALL_SEPARATORS;
37
38 // Attempt to find separators in the static cache
39
40 groupingUniSet = unisets::get(groupingKey);
41 unisets::Key decimalKey = unisets::chooseFrom(
42 decimalSeparator,
43 strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
44 strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
45 if (decimalKey >= 0) {
46 decimalUniSet = unisets::get(decimalKey);
47 } else if (!decimalSeparator.isEmpty()) {
48 auto* set = new UnicodeSet();
49 set->add(decimalSeparator.char32At(0));
50 set->freeze();
51 decimalUniSet = set;
52 fLocalDecimalUniSet.adoptInstead(set);
53 } else {
54 decimalUniSet = unisets::get(unisets::EMPTY);
55 }
56
57 if (groupingKey >= 0 && decimalKey >= 0) {
58 // Everything is available in the static cache
59 separatorSet = groupingUniSet;
60 leadSet = unisets::get(
61 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
62 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
63 } else {
64 auto* set = new UnicodeSet();
65 set->addAll(*groupingUniSet);
66 set->addAll(*decimalUniSet);
67 set->freeze();
68 separatorSet = set;
69 fLocalSeparatorSet.adoptInstead(set);
70 leadSet = nullptr;
71 }
72
73 UChar32 cpZero = symbols.getCodePointZero();
74 if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
75 // Uncommon case: okay to allocate.
76 auto digitStrings = new UnicodeString[10];
77 fLocalDigitStrings.adoptInstead(digitStrings);
78 for (int32_t i = 0; i <= 9; i++) {
79 digitStrings[i] = symbols.getConstDigitSymbol(i);
80 }
81 }
82
83 requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
84 groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
85 integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
86 grouping1 = grouper.getPrimary();
87 grouping2 = grouper.getSecondary();
88
89 // Fraction grouping parsing is disabled for now but could be enabled later.
90 // See http://bugs.icu-project.org/trac/ticket/10794
91 // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
92 }
93
94 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
95 return match(segment, result, 0, status);
96 }
97
98 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
99 UErrorCode&) const {
100 if (result.seenNumber() && exponentSign == 0) {
101 // A number has already been consumed.
102 return false;
103 } else if (exponentSign != 0) {
104 // scientific notation always comes after the number
105 U_ASSERT(!result.quantity.bogus);
106 }
107
108 // Initial offset before any character consumption.
109 int32_t initialOffset = segment.getOffset();
110
111 // Return value: whether to ask for more characters.
112 bool maybeMore = false;
113
114 // All digits consumed so far.
115 number::impl::DecimalQuantity digitsConsumed;
116 digitsConsumed.bogus = true;
117
118 // The total number of digits after the decimal place, used for scaling the result.
119 int32_t digitsAfterDecimalPlace = 0;
120
121 // The actual grouping and decimal separators used in the string.
122 // If non-null, we have seen that token.
123 UnicodeString actualGroupingString;
124 UnicodeString actualDecimalString;
125 actualGroupingString.setToBogus();
126 actualDecimalString.setToBogus();
127
128 // Information for two groups: the previous group and the current group.
129 //
130 // Each group has three pieces of information:
131 //
132 // Offset: the string position of the beginning of the group, including a leading separator
133 // if there was a leading separator. This is needed in case we need to rewind the parse to
134 // that position.
135 //
136 // Separator type:
137 // 0 => beginning of string
138 // 1 => lead separator is a grouping separator
139 // 2 => lead separator is a decimal separator
140 //
141 // Count: the number of digits in the group. If -1, the group has been validated.
142 int32_t currGroupOffset = 0;
143 int32_t currGroupSepType = 0;
144 int32_t currGroupCount = 0;
145 int32_t prevGroupOffset = -1;
146 int32_t prevGroupSepType = -1;
147 int32_t prevGroupCount = -1;
148
149 while (segment.length() > 0) {
150 maybeMore = false;
151
152 // Attempt to match a digit.
153 int8_t digit = -1;
154
155 // Try by code point digit value.
156 UChar32 cp = segment.getCodePoint();
157 if (u_isdigit(cp)) {
158 segment.adjustOffset(U16_LENGTH(cp));
159 digit = static_cast<int8_t>(u_digit(cp, 10));
160 }
161
162 // Try by digit string.
163 if (digit == -1 && !fLocalDigitStrings.isNull()) {
164 for (int32_t i = 0; i < 10; i++) {
165 const UnicodeString& str = fLocalDigitStrings[i];
166 if (str.isEmpty()) {
167 continue;
168 }
169 int32_t overlap = segment.getCommonPrefixLength(str);
170 if (overlap == str.length()) {
171 segment.adjustOffset(overlap);
172 digit = static_cast<int8_t>(i);
173 break;
174 }
175 maybeMore = maybeMore || (overlap == segment.length());
176 }
177 }
178
179 if (digit >= 0) {
180 // Digit was found.
181 if (digitsConsumed.bogus) {
182 digitsConsumed.bogus = false;
183 digitsConsumed.clear();
184 }
185 digitsConsumed.appendDigit(digit, 0, true);
186 currGroupCount++;
187 if (!actualDecimalString.isBogus()) {
188 digitsAfterDecimalPlace++;
189 }
190 continue;
191 }
192
193 // Attempt to match a literal grouping or decimal separator.
194 bool isDecimal = false;
195 bool isGrouping = false;
196
197 // 1) Attempt the decimal separator string literal.
198 // if (we have not seen a decimal separator yet) { ... }
199 if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
200 int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
201 maybeMore = maybeMore || (overlap == segment.length());
202 if (overlap == decimalSeparator.length()) {
203 isDecimal = true;
204 actualDecimalString = decimalSeparator;
205 }
206 }
207
208 // 2) Attempt to match the actual grouping string literal.
209 if (!actualGroupingString.isBogus()) {
210 int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
211 maybeMore = maybeMore || (overlap == segment.length());
212 if (overlap == actualGroupingString.length()) {
213 isGrouping = true;
214 }
215 }
216
217 // 2.5) Attempt to match a new the grouping separator string literal.
218 // if (we have not seen a grouping or decimal separator yet) { ... }
219 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
220 !groupingSeparator.isEmpty()) {
221 int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
222 maybeMore = maybeMore || (overlap == segment.length());
223 if (overlap == groupingSeparator.length()) {
224 isGrouping = true;
225 actualGroupingString = groupingSeparator;
226 }
227 }
228
229 // 3) Attempt to match a decimal separator from the equivalence set.
230 // if (we have not seen a decimal separator yet) { ... }
231 // The !isGrouping is to confirm that we haven't yet matched the current character.
232 if (!isGrouping && actualDecimalString.isBogus()) {
233 if (decimalUniSet->contains(cp)) {
234 isDecimal = true;
235 actualDecimalString = UnicodeString(cp);
236 }
237 }
238
239 // 4) Attempt to match a grouping separator from the equivalence set.
240 // if (we have not seen a grouping or decimal separator yet) { ... }
241 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
242 if (groupingUniSet->contains(cp)) {
243 isGrouping = true;
244 actualGroupingString = UnicodeString(cp);
245 }
246 }
247
248 // Leave if we failed to match this as a separator.
249 if (!isDecimal && !isGrouping) {
250 break;
251 }
252
253 // Check for conditions when we don't want to accept the separator.
254 if (isDecimal && integerOnly) {
255 break;
256 } else if (currGroupSepType == 2 && isGrouping) {
257 // Fraction grouping
258 break;
259 }
260
261 // Validate intermediate grouping sizes.
262 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
263 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
264 if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
265 // Invalid grouping sizes.
266 if (isGrouping && currGroupCount == 0) {
267 // Trailing grouping separators: these are taken care of below
268 U_ASSERT(currGroupSepType == 1);
269 } else if (requireGroupingMatch) {
270 // Strict mode: reject the parse
271 digitsConsumed.clear();
272 digitsConsumed.bogus = true;
273 }
274 break;
275 } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
276 break;
277 } else {
278 // Grouping sizes OK so far.
279 prevGroupOffset = currGroupOffset;
280 prevGroupCount = currGroupCount;
281 if (isDecimal) {
282 // Do not validate this group any more.
283 prevGroupSepType = -1;
284 } else {
285 prevGroupSepType = currGroupSepType;
286 }
287 }
288
289 // OK to accept the separator.
290 // Special case: don't update currGroup if it is empty; this allows two grouping
291 // separators in a row in lenient mode.
292 if (currGroupCount != 0) {
293 currGroupOffset = segment.getOffset();
294 }
295 currGroupSepType = isGrouping ? 1 : 2;
296 currGroupCount = 0;
297 if (isGrouping) {
298 segment.adjustOffset(actualGroupingString.length());
299 } else {
300 segment.adjustOffset(actualDecimalString.length());
301 }
302 }
303
304 // End of main loop.
305 // Back up if there was a trailing grouping separator.
306 // Shift prev -> curr so we can check it as a final group.
307 if (currGroupSepType != 2 && currGroupCount == 0) {
308 maybeMore = true;
309 segment.setOffset(currGroupOffset);
310 currGroupOffset = prevGroupOffset;
311 currGroupSepType = prevGroupSepType;
312 currGroupCount = prevGroupCount;
313 prevGroupOffset = -1;
314 prevGroupSepType = 0;
315 prevGroupCount = 1;
316 }
317
318 // Validate final grouping sizes.
319 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
320 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
321 if (!requireGroupingMatch) {
322 // The cases we need to handle here are lone digits.
323 // Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
324 // See more examples in numberformattestspecification.txt
325 int32_t digitsToRemove = 0;
326 if (!prevValidSecondary) {
327 segment.setOffset(prevGroupOffset);
328 digitsToRemove += prevGroupCount;
329 digitsToRemove += currGroupCount;
330 } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
331 maybeMore = true;
332 segment.setOffset(currGroupOffset);
333 digitsToRemove += currGroupCount;
334 }
335 if (digitsToRemove != 0) {
336 digitsConsumed.adjustMagnitude(-digitsToRemove);
337 digitsConsumed.truncate();
338 }
339 prevValidSecondary = true;
340 currValidPrimary = true;
341 }
342 if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
343 // Grouping failure.
344 digitsConsumed.bogus = true;
345 }
346
347 // Strings that start with a separator but have no digits,
348 // or strings that failed a grouping size check.
349 if (digitsConsumed.bogus) {
350 maybeMore = maybeMore || (segment.length() == 0);
351 segment.setOffset(initialOffset);
352 return maybeMore;
353 }
354
355 // We passed all inspections. Start post-processing.
356
357 // Adjust for fraction part.
358 digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
359
360 // Set the digits, either normal or exponent.
361 if (exponentSign != 0 && segment.getOffset() != initialOffset) {
362 bool overflow = false;
363 if (digitsConsumed.fitsInLong()) {
364 int64_t exponentLong = digitsConsumed.toLong(false);
365 U_ASSERT(exponentLong >= 0);
366 if (exponentLong <= INT32_MAX) {
367 auto exponentInt = static_cast<int32_t>(exponentLong);
368 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
369 overflow = true;
370 }
371 } else {
372 overflow = true;
373 }
374 } else {
375 overflow = true;
376 }
377 if (overflow) {
378 if (exponentSign == -1) {
379 // Set to zero
380 result.quantity.clear();
381 } else {
382 // Set to infinity
383 result.quantity.bogus = true;
384 result.flags |= FLAG_INFINITY;
385 }
386 }
387 } else {
388 result.quantity = digitsConsumed;
389 }
390
391 // Set other information into the result and return.
392 if (!actualDecimalString.isBogus()) {
393 result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
394 }
395 result.setCharsConsumed(segment);
396 return segment.length() == 0 || maybeMore;
397 }
398
399 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
400 if (requireGroupingMatch) {
401 if (sepType == -1) {
402 // No such group (prevGroup before first shift).
403 return true;
404 } else if (sepType == 0) {
405 // First group.
406 if (isPrimary) {
407 // No grouping separators is OK.
408 return true;
409 } else {
410 return count != 0 && count <= grouping2;
411 }
412 } else if (sepType == 1) {
413 // Middle group.
414 if (isPrimary) {
415 return count == grouping1;
416 } else {
417 return count == grouping2;
418 }
419 } else {
420 U_ASSERT(sepType == 2);
421 // After the decimal separator.
422 return true;
423 }
424 } else {
425 if (sepType == 1) {
426 // #11230: don't accept middle groups with only 1 digit.
427 return count != 1;
428 } else {
429 return true;
430 }
431 }
432 }
433
434 bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
435 // The common case uses a static leadSet for efficiency.
436 if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
437 return segment.startsWith(*leadSet);
438 }
439 if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
440 return true;
441 }
442 if (fLocalDigitStrings.isNull()) {
443 return false;
444 }
445 for (int32_t i = 0; i < 10; i++) {
446 if (segment.startsWith(fLocalDigitStrings[i])) {
447 return true;
448 }
449 }
450 return false;
451 }
452
453 UnicodeString DecimalMatcher::toString() const {
454 return u"<Decimal>";
455 }
456
457
458 #endif /* #if !UCONFIG_NO_FORMATTING */