]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/numparse_decimal.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / i18n / numparse_decimal.cpp
CommitLineData
0f5d89e8
A
1// © 2018 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4#include "unicode/utypes.h"
5
6#if !UCONFIG_NO_FORMATTING
7
8// Allow implicit conversion from char16_t* to UnicodeString for this file:
9// Helpful in toString methods and elsewhere.
10#define UNISTR_FROM_STRING_EXPLICIT
11
12#include "numparse_types.h"
13#include "numparse_decimal.h"
14#include "static_unicode_sets.h"
15#include "numparse_utils.h"
16#include "unicode/uchar.h"
17#include "putilimp.h"
18#include "number_decimalquantity.h"
340931cb 19#include "string_segment.h"
0f5d89e8
A
20
21using namespace icu;
22using namespace icu::numparse;
23using namespace icu::numparse::impl;
24
25
26DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
27 parse_flags_t parseFlags) {
28 if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
29 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
30 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
31 } else {
32 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
33 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
34 }
35 bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
36 unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
37 : unisets::ALL_SEPARATORS;
38
39 // Attempt to find separators in the static cache
40
41 groupingUniSet = unisets::get(groupingKey);
42 unisets::Key decimalKey = unisets::chooseFrom(
43 decimalSeparator,
44 strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
45 strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
46 if (decimalKey >= 0) {
47 decimalUniSet = unisets::get(decimalKey);
48 } else if (!decimalSeparator.isEmpty()) {
49 auto* set = new UnicodeSet();
50 set->add(decimalSeparator.char32At(0));
51 set->freeze();
52 decimalUniSet = set;
53 fLocalDecimalUniSet.adoptInstead(set);
54 } else {
55 decimalUniSet = unisets::get(unisets::EMPTY);
56 }
57
58 if (groupingKey >= 0 && decimalKey >= 0) {
59 // Everything is available in the static cache
60 separatorSet = groupingUniSet;
61 leadSet = unisets::get(
62 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
63 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
64 } else {
65 auto* set = new UnicodeSet();
66 set->addAll(*groupingUniSet);
67 set->addAll(*decimalUniSet);
68 set->freeze();
69 separatorSet = set;
70 fLocalSeparatorSet.adoptInstead(set);
71 leadSet = nullptr;
72 }
73
74 UChar32 cpZero = symbols.getCodePointZero();
75 if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
76 // Uncommon case: okay to allocate.
77 auto digitStrings = new UnicodeString[10];
78 fLocalDigitStrings.adoptInstead(digitStrings);
79 for (int32_t i = 0; i <= 9; i++) {
80 digitStrings[i] = symbols.getConstDigitSymbol(i);
81 }
82 }
83
84 requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
85 groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
86 integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
87 grouping1 = grouper.getPrimary();
88 grouping2 = grouper.getSecondary();
89
90 // Fraction grouping parsing is disabled for now but could be enabled later.
91 // See http://bugs.icu-project.org/trac/ticket/10794
92 // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
93}
94
95bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
96 return match(segment, result, 0, status);
97}
98
99bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
100 UErrorCode&) const {
101 if (result.seenNumber() && exponentSign == 0) {
102 // A number has already been consumed.
103 return false;
104 } else if (exponentSign != 0) {
105 // scientific notation always comes after the number
106 U_ASSERT(!result.quantity.bogus);
107 }
108
109 // Initial offset before any character consumption.
110 int32_t initialOffset = segment.getOffset();
111
112 // Return value: whether to ask for more characters.
113 bool maybeMore = false;
114
115 // All digits consumed so far.
116 number::impl::DecimalQuantity digitsConsumed;
117 digitsConsumed.bogus = true;
118
119 // The total number of digits after the decimal place, used for scaling the result.
120 int32_t digitsAfterDecimalPlace = 0;
121
122 // The actual grouping and decimal separators used in the string.
123 // If non-null, we have seen that token.
124 UnicodeString actualGroupingString;
125 UnicodeString actualDecimalString;
126 actualGroupingString.setToBogus();
127 actualDecimalString.setToBogus();
128
129 // Information for two groups: the previous group and the current group.
130 //
131 // Each group has three pieces of information:
132 //
133 // Offset: the string position of the beginning of the group, including a leading separator
134 // if there was a leading separator. This is needed in case we need to rewind the parse to
135 // that position.
136 //
137 // Separator type:
138 // 0 => beginning of string
139 // 1 => lead separator is a grouping separator
140 // 2 => lead separator is a decimal separator
141 //
142 // Count: the number of digits in the group. If -1, the group has been validated.
143 int32_t currGroupOffset = 0;
144 int32_t currGroupSepType = 0;
145 int32_t currGroupCount = 0;
146 int32_t prevGroupOffset = -1;
147 int32_t prevGroupSepType = -1;
148 int32_t prevGroupCount = -1;
149
150 while (segment.length() > 0) {
151 maybeMore = false;
152
153 // Attempt to match a digit.
154 int8_t digit = -1;
155
156 // Try by code point digit value.
157 UChar32 cp = segment.getCodePoint();
158 if (u_isdigit(cp)) {
159 segment.adjustOffset(U16_LENGTH(cp));
160 digit = static_cast<int8_t>(u_digit(cp, 10));
161 }
162
163 // Try by digit string.
164 if (digit == -1 && !fLocalDigitStrings.isNull()) {
165 for (int32_t i = 0; i < 10; i++) {
166 const UnicodeString& str = fLocalDigitStrings[i];
167 if (str.isEmpty()) {
168 continue;
169 }
3d1f044b
A
170 // The following test is Apple-specific, for <rdar://7632623>;
171 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
172 int32_t overlap = (segment.startsWith(0x96F6) && fLocalDigitStrings[0].charAt(0)==0x3007)?
173 1: segment.getCommonPrefixLength(str);
0f5d89e8
A
174 if (overlap == str.length()) {
175 segment.adjustOffset(overlap);
176 digit = static_cast<int8_t>(i);
177 break;
178 }
179 maybeMore = maybeMore || (overlap == segment.length());
180 }
181 }
182
183 if (digit >= 0) {
184 // Digit was found.
185 if (digitsConsumed.bogus) {
186 digitsConsumed.bogus = false;
187 digitsConsumed.clear();
188 }
189 digitsConsumed.appendDigit(digit, 0, true);
190 currGroupCount++;
191 if (!actualDecimalString.isBogus()) {
192 digitsAfterDecimalPlace++;
193 }
194 continue;
195 }
196
197 // Attempt to match a literal grouping or decimal separator.
198 bool isDecimal = false;
199 bool isGrouping = false;
200
201 // 1) Attempt the decimal separator string literal.
202 // if (we have not seen a decimal separator yet) { ... }
203 if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
204 int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
205 maybeMore = maybeMore || (overlap == segment.length());
206 if (overlap == decimalSeparator.length()) {
207 isDecimal = true;
208 actualDecimalString = decimalSeparator;
209 }
210 }
211
212 // 2) Attempt to match the actual grouping string literal.
213 if (!actualGroupingString.isBogus()) {
214 int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
215 maybeMore = maybeMore || (overlap == segment.length());
216 if (overlap == actualGroupingString.length()) {
217 isGrouping = true;
218 }
219 }
220
221 // 2.5) Attempt to match a new the grouping separator string literal.
222 // if (we have not seen a grouping or decimal separator yet) { ... }
223 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
224 !groupingSeparator.isEmpty()) {
225 int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
226 maybeMore = maybeMore || (overlap == segment.length());
227 if (overlap == groupingSeparator.length()) {
228 isGrouping = true;
229 actualGroupingString = groupingSeparator;
230 }
231 }
232
233 // 3) Attempt to match a decimal separator from the equivalence set.
234 // if (we have not seen a decimal separator yet) { ... }
235 // The !isGrouping is to confirm that we haven't yet matched the current character.
236 if (!isGrouping && actualDecimalString.isBogus()) {
237 if (decimalUniSet->contains(cp)) {
238 isDecimal = true;
239 actualDecimalString = UnicodeString(cp);
240 }
241 }
242
243 // 4) Attempt to match a grouping separator from the equivalence set.
244 // if (we have not seen a grouping or decimal separator yet) { ... }
245 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
246 if (groupingUniSet->contains(cp)) {
247 isGrouping = true;
248 actualGroupingString = UnicodeString(cp);
249 }
250 }
251
252 // Leave if we failed to match this as a separator.
253 if (!isDecimal && !isGrouping) {
254 break;
255 }
256
257 // Check for conditions when we don't want to accept the separator.
258 if (isDecimal && integerOnly) {
259 break;
260 } else if (currGroupSepType == 2 && isGrouping) {
261 // Fraction grouping
262 break;
263 }
264
265 // Validate intermediate grouping sizes.
266 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
267 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
268 if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
269 // Invalid grouping sizes.
270 if (isGrouping && currGroupCount == 0) {
271 // Trailing grouping separators: these are taken care of below
272 U_ASSERT(currGroupSepType == 1);
273 } else if (requireGroupingMatch) {
274 // Strict mode: reject the parse
275 digitsConsumed.clear();
276 digitsConsumed.bogus = true;
277 }
278 break;
279 } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
280 break;
281 } else {
282 // Grouping sizes OK so far.
283 prevGroupOffset = currGroupOffset;
284 prevGroupCount = currGroupCount;
285 if (isDecimal) {
286 // Do not validate this group any more.
287 prevGroupSepType = -1;
288 } else {
289 prevGroupSepType = currGroupSepType;
290 }
291 }
292
293 // OK to accept the separator.
294 // Special case: don't update currGroup if it is empty; this allows two grouping
295 // separators in a row in lenient mode.
296 if (currGroupCount != 0) {
297 currGroupOffset = segment.getOffset();
298 }
299 currGroupSepType = isGrouping ? 1 : 2;
300 currGroupCount = 0;
301 if (isGrouping) {
302 segment.adjustOffset(actualGroupingString.length());
303 } else {
304 segment.adjustOffset(actualDecimalString.length());
305 }
306 }
307
308 // End of main loop.
309 // Back up if there was a trailing grouping separator.
310 // Shift prev -> curr so we can check it as a final group.
311 if (currGroupSepType != 2 && currGroupCount == 0) {
312 maybeMore = true;
313 segment.setOffset(currGroupOffset);
314 currGroupOffset = prevGroupOffset;
315 currGroupSepType = prevGroupSepType;
316 currGroupCount = prevGroupCount;
317 prevGroupOffset = -1;
318 prevGroupSepType = 0;
319 prevGroupCount = 1;
320 }
321
322 // Validate final grouping sizes.
323 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
324 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
325 if (!requireGroupingMatch) {
326 // The cases we need to handle here are lone digits.
327 // Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
328 // See more examples in numberformattestspecification.txt
329 int32_t digitsToRemove = 0;
330 if (!prevValidSecondary) {
331 segment.setOffset(prevGroupOffset);
332 digitsToRemove += prevGroupCount;
333 digitsToRemove += currGroupCount;
334 } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
335 maybeMore = true;
336 segment.setOffset(currGroupOffset);
337 digitsToRemove += currGroupCount;
338 }
339 if (digitsToRemove != 0) {
340 digitsConsumed.adjustMagnitude(-digitsToRemove);
341 digitsConsumed.truncate();
342 }
343 prevValidSecondary = true;
344 currValidPrimary = true;
345 }
346 if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
347 // Grouping failure.
348 digitsConsumed.bogus = true;
349 }
350
351 // Strings that start with a separator but have no digits,
352 // or strings that failed a grouping size check.
353 if (digitsConsumed.bogus) {
354 maybeMore = maybeMore || (segment.length() == 0);
355 segment.setOffset(initialOffset);
356 return maybeMore;
357 }
358
359 // We passed all inspections. Start post-processing.
360
361 // Adjust for fraction part.
362 digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
363
364 // Set the digits, either normal or exponent.
365 if (exponentSign != 0 && segment.getOffset() != initialOffset) {
366 bool overflow = false;
367 if (digitsConsumed.fitsInLong()) {
368 int64_t exponentLong = digitsConsumed.toLong(false);
369 U_ASSERT(exponentLong >= 0);
370 if (exponentLong <= INT32_MAX) {
371 auto exponentInt = static_cast<int32_t>(exponentLong);
372 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
373 overflow = true;
374 }
375 } else {
376 overflow = true;
377 }
378 } else {
379 overflow = true;
380 }
381 if (overflow) {
382 if (exponentSign == -1) {
383 // Set to zero
384 result.quantity.clear();
385 } else {
386 // Set to infinity
387 result.quantity.bogus = true;
388 result.flags |= FLAG_INFINITY;
389 }
390 }
391 } else {
392 result.quantity = digitsConsumed;
393 }
394
395 // Set other information into the result and return.
396 if (!actualDecimalString.isBogus()) {
397 result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
398 }
399 result.setCharsConsumed(segment);
400 return segment.length() == 0 || maybeMore;
401}
402
403bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
404 if (requireGroupingMatch) {
405 if (sepType == -1) {
406 // No such group (prevGroup before first shift).
407 return true;
408 } else if (sepType == 0) {
409 // First group.
410 if (isPrimary) {
411 // No grouping separators is OK.
412 return true;
413 } else {
3d1f044b
A
414 // return count != 0 && count <= grouping2;
415 return count <= grouping2; // Apple <rdar://problem/38565910>, allow initial secondary group of 0
0f5d89e8
A
416 }
417 } else if (sepType == 1) {
418 // Middle group.
419 if (isPrimary) {
420 return count == grouping1;
421 } else {
422 return count == grouping2;
423 }
424 } else {
425 U_ASSERT(sepType == 2);
426 // After the decimal separator.
427 return true;
428 }
429 } else {
430 if (sepType == 1) {
431 // #11230: don't accept middle groups with only 1 digit.
432 return count != 1;
433 } else {
434 return true;
435 }
436 }
437}
438
439bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
440 // The common case uses a static leadSet for efficiency.
441 if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
442 return segment.startsWith(*leadSet);
443 }
444 if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
445 return true;
446 }
447 if (fLocalDigitStrings.isNull()) {
448 return false;
449 }
3d1f044b
A
450 // The following test is Apple-specific, for <rdar://7632623>;
451 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
452 if (segment.startsWith(0x96F6) && fLocalDigitStrings[0].length()==1 && fLocalDigitStrings[0].charAt(0)==0x3007) {
453 return true;
454 }
0f5d89e8
A
455 for (int32_t i = 0; i < 10; i++) {
456 if (segment.startsWith(fLocalDigitStrings[i])) {
457 return true;
458 }
459 }
460 return false;
461}
462
463UnicodeString DecimalMatcher::toString() const {
464 return u"<Decimal>";
465}
466
467
468#endif /* #if !UCONFIG_NO_FORMATTING */