]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/numparse_decimal.cpp
ICU-64243.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / numparse_decimal.cpp
1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include "unicode/utypes.h"
5
6 #if !UCONFIG_NO_FORMATTING
7
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11
12 #include "numparse_types.h"
13 #include "numparse_decimal.h"
14 #include "static_unicode_sets.h"
15 #include "numparse_utils.h"
16 #include "unicode/uchar.h"
17 #include "putilimp.h"
18 #include "number_decimalquantity.h"
19
20 using namespace icu;
21 using namespace icu::numparse;
22 using namespace icu::numparse::impl;
23
24
25 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
26 parse_flags_t parseFlags) {
27 if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
28 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
29 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
30 } else {
31 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
32 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
33 }
34 bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
35 unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
36 : unisets::ALL_SEPARATORS;
37
38 // Attempt to find separators in the static cache
39
40 groupingUniSet = unisets::get(groupingKey);
41 unisets::Key decimalKey = unisets::chooseFrom(
42 decimalSeparator,
43 strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
44 strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
45 if (decimalKey >= 0) {
46 decimalUniSet = unisets::get(decimalKey);
47 } else if (!decimalSeparator.isEmpty()) {
48 auto* set = new UnicodeSet();
49 set->add(decimalSeparator.char32At(0));
50 set->freeze();
51 decimalUniSet = set;
52 fLocalDecimalUniSet.adoptInstead(set);
53 } else {
54 decimalUniSet = unisets::get(unisets::EMPTY);
55 }
56
57 if (groupingKey >= 0 && decimalKey >= 0) {
58 // Everything is available in the static cache
59 separatorSet = groupingUniSet;
60 leadSet = unisets::get(
61 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
62 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
63 } else {
64 auto* set = new UnicodeSet();
65 set->addAll(*groupingUniSet);
66 set->addAll(*decimalUniSet);
67 set->freeze();
68 separatorSet = set;
69 fLocalSeparatorSet.adoptInstead(set);
70 leadSet = nullptr;
71 }
72
73 UChar32 cpZero = symbols.getCodePointZero();
74 if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
75 // Uncommon case: okay to allocate.
76 auto digitStrings = new UnicodeString[10];
77 fLocalDigitStrings.adoptInstead(digitStrings);
78 for (int32_t i = 0; i <= 9; i++) {
79 digitStrings[i] = symbols.getConstDigitSymbol(i);
80 }
81 }
82
83 requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
84 groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
85 integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
86 grouping1 = grouper.getPrimary();
87 grouping2 = grouper.getSecondary();
88
89 // Fraction grouping parsing is disabled for now but could be enabled later.
90 // See http://bugs.icu-project.org/trac/ticket/10794
91 // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
92 }
93
94 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
95 return match(segment, result, 0, status);
96 }
97
98 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
99 UErrorCode&) const {
100 if (result.seenNumber() && exponentSign == 0) {
101 // A number has already been consumed.
102 return false;
103 } else if (exponentSign != 0) {
104 // scientific notation always comes after the number
105 U_ASSERT(!result.quantity.bogus);
106 }
107
108 // Initial offset before any character consumption.
109 int32_t initialOffset = segment.getOffset();
110
111 // Return value: whether to ask for more characters.
112 bool maybeMore = false;
113
114 // All digits consumed so far.
115 number::impl::DecimalQuantity digitsConsumed;
116 digitsConsumed.bogus = true;
117
118 // The total number of digits after the decimal place, used for scaling the result.
119 int32_t digitsAfterDecimalPlace = 0;
120
121 // The actual grouping and decimal separators used in the string.
122 // If non-null, we have seen that token.
123 UnicodeString actualGroupingString;
124 UnicodeString actualDecimalString;
125 actualGroupingString.setToBogus();
126 actualDecimalString.setToBogus();
127
128 // Information for two groups: the previous group and the current group.
129 //
130 // Each group has three pieces of information:
131 //
132 // Offset: the string position of the beginning of the group, including a leading separator
133 // if there was a leading separator. This is needed in case we need to rewind the parse to
134 // that position.
135 //
136 // Separator type:
137 // 0 => beginning of string
138 // 1 => lead separator is a grouping separator
139 // 2 => lead separator is a decimal separator
140 //
141 // Count: the number of digits in the group. If -1, the group has been validated.
142 int32_t currGroupOffset = 0;
143 int32_t currGroupSepType = 0;
144 int32_t currGroupCount = 0;
145 int32_t prevGroupOffset = -1;
146 int32_t prevGroupSepType = -1;
147 int32_t prevGroupCount = -1;
148
149 while (segment.length() > 0) {
150 maybeMore = false;
151
152 // Attempt to match a digit.
153 int8_t digit = -1;
154
155 // Try by code point digit value.
156 UChar32 cp = segment.getCodePoint();
157 if (u_isdigit(cp)) {
158 segment.adjustOffset(U16_LENGTH(cp));
159 digit = static_cast<int8_t>(u_digit(cp, 10));
160 }
161
162 // Try by digit string.
163 if (digit == -1 && !fLocalDigitStrings.isNull()) {
164 for (int32_t i = 0; i < 10; i++) {
165 const UnicodeString& str = fLocalDigitStrings[i];
166 if (str.isEmpty()) {
167 continue;
168 }
169 // The following test is Apple-specific, for <rdar://7632623>;
170 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
171 int32_t overlap = (segment.startsWith(0x96F6) && fLocalDigitStrings[0].charAt(0)==0x3007)?
172 1: segment.getCommonPrefixLength(str);
173 if (overlap == str.length()) {
174 segment.adjustOffset(overlap);
175 digit = static_cast<int8_t>(i);
176 break;
177 }
178 maybeMore = maybeMore || (overlap == segment.length());
179 }
180 }
181
182 if (digit >= 0) {
183 // Digit was found.
184 if (digitsConsumed.bogus) {
185 digitsConsumed.bogus = false;
186 digitsConsumed.clear();
187 }
188 digitsConsumed.appendDigit(digit, 0, true);
189 currGroupCount++;
190 if (!actualDecimalString.isBogus()) {
191 digitsAfterDecimalPlace++;
192 }
193 continue;
194 }
195
196 // Attempt to match a literal grouping or decimal separator.
197 bool isDecimal = false;
198 bool isGrouping = false;
199
200 // 1) Attempt the decimal separator string literal.
201 // if (we have not seen a decimal separator yet) { ... }
202 if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
203 int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
204 maybeMore = maybeMore || (overlap == segment.length());
205 if (overlap == decimalSeparator.length()) {
206 isDecimal = true;
207 actualDecimalString = decimalSeparator;
208 }
209 }
210
211 // 2) Attempt to match the actual grouping string literal.
212 if (!actualGroupingString.isBogus()) {
213 int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
214 maybeMore = maybeMore || (overlap == segment.length());
215 if (overlap == actualGroupingString.length()) {
216 isGrouping = true;
217 }
218 }
219
220 // 2.5) Attempt to match a new the grouping separator string literal.
221 // if (we have not seen a grouping or decimal separator yet) { ... }
222 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
223 !groupingSeparator.isEmpty()) {
224 int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
225 maybeMore = maybeMore || (overlap == segment.length());
226 if (overlap == groupingSeparator.length()) {
227 isGrouping = true;
228 actualGroupingString = groupingSeparator;
229 }
230 }
231
232 // 3) Attempt to match a decimal separator from the equivalence set.
233 // if (we have not seen a decimal separator yet) { ... }
234 // The !isGrouping is to confirm that we haven't yet matched the current character.
235 if (!isGrouping && actualDecimalString.isBogus()) {
236 if (decimalUniSet->contains(cp)) {
237 isDecimal = true;
238 actualDecimalString = UnicodeString(cp);
239 }
240 }
241
242 // 4) Attempt to match a grouping separator from the equivalence set.
243 // if (we have not seen a grouping or decimal separator yet) { ... }
244 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
245 if (groupingUniSet->contains(cp)) {
246 isGrouping = true;
247 actualGroupingString = UnicodeString(cp);
248 }
249 }
250
251 // Leave if we failed to match this as a separator.
252 if (!isDecimal && !isGrouping) {
253 break;
254 }
255
256 // Check for conditions when we don't want to accept the separator.
257 if (isDecimal && integerOnly) {
258 break;
259 } else if (currGroupSepType == 2 && isGrouping) {
260 // Fraction grouping
261 break;
262 }
263
264 // Validate intermediate grouping sizes.
265 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
266 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
267 if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
268 // Invalid grouping sizes.
269 if (isGrouping && currGroupCount == 0) {
270 // Trailing grouping separators: these are taken care of below
271 U_ASSERT(currGroupSepType == 1);
272 } else if (requireGroupingMatch) {
273 // Strict mode: reject the parse
274 digitsConsumed.clear();
275 digitsConsumed.bogus = true;
276 }
277 break;
278 } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
279 break;
280 } else {
281 // Grouping sizes OK so far.
282 prevGroupOffset = currGroupOffset;
283 prevGroupCount = currGroupCount;
284 if (isDecimal) {
285 // Do not validate this group any more.
286 prevGroupSepType = -1;
287 } else {
288 prevGroupSepType = currGroupSepType;
289 }
290 }
291
292 // OK to accept the separator.
293 // Special case: don't update currGroup if it is empty; this allows two grouping
294 // separators in a row in lenient mode.
295 if (currGroupCount != 0) {
296 currGroupOffset = segment.getOffset();
297 }
298 currGroupSepType = isGrouping ? 1 : 2;
299 currGroupCount = 0;
300 if (isGrouping) {
301 segment.adjustOffset(actualGroupingString.length());
302 } else {
303 segment.adjustOffset(actualDecimalString.length());
304 }
305 }
306
307 // End of main loop.
308 // Back up if there was a trailing grouping separator.
309 // Shift prev -> curr so we can check it as a final group.
310 if (currGroupSepType != 2 && currGroupCount == 0) {
311 maybeMore = true;
312 segment.setOffset(currGroupOffset);
313 currGroupOffset = prevGroupOffset;
314 currGroupSepType = prevGroupSepType;
315 currGroupCount = prevGroupCount;
316 prevGroupOffset = -1;
317 prevGroupSepType = 0;
318 prevGroupCount = 1;
319 }
320
321 // Validate final grouping sizes.
322 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
323 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
324 if (!requireGroupingMatch) {
325 // The cases we need to handle here are lone digits.
326 // Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
327 // See more examples in numberformattestspecification.txt
328 int32_t digitsToRemove = 0;
329 if (!prevValidSecondary) {
330 segment.setOffset(prevGroupOffset);
331 digitsToRemove += prevGroupCount;
332 digitsToRemove += currGroupCount;
333 } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
334 maybeMore = true;
335 segment.setOffset(currGroupOffset);
336 digitsToRemove += currGroupCount;
337 }
338 if (digitsToRemove != 0) {
339 digitsConsumed.adjustMagnitude(-digitsToRemove);
340 digitsConsumed.truncate();
341 }
342 prevValidSecondary = true;
343 currValidPrimary = true;
344 }
345 if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
346 // Grouping failure.
347 digitsConsumed.bogus = true;
348 }
349
350 // Strings that start with a separator but have no digits,
351 // or strings that failed a grouping size check.
352 if (digitsConsumed.bogus) {
353 maybeMore = maybeMore || (segment.length() == 0);
354 segment.setOffset(initialOffset);
355 return maybeMore;
356 }
357
358 // We passed all inspections. Start post-processing.
359
360 // Adjust for fraction part.
361 digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
362
363 // Set the digits, either normal or exponent.
364 if (exponentSign != 0 && segment.getOffset() != initialOffset) {
365 bool overflow = false;
366 if (digitsConsumed.fitsInLong()) {
367 int64_t exponentLong = digitsConsumed.toLong(false);
368 U_ASSERT(exponentLong >= 0);
369 if (exponentLong <= INT32_MAX) {
370 auto exponentInt = static_cast<int32_t>(exponentLong);
371 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
372 overflow = true;
373 }
374 } else {
375 overflow = true;
376 }
377 } else {
378 overflow = true;
379 }
380 if (overflow) {
381 if (exponentSign == -1) {
382 // Set to zero
383 result.quantity.clear();
384 } else {
385 // Set to infinity
386 result.quantity.bogus = true;
387 result.flags |= FLAG_INFINITY;
388 }
389 }
390 } else {
391 result.quantity = digitsConsumed;
392 }
393
394 // Set other information into the result and return.
395 if (!actualDecimalString.isBogus()) {
396 result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
397 }
398 result.setCharsConsumed(segment);
399 return segment.length() == 0 || maybeMore;
400 }
401
402 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
403 if (requireGroupingMatch) {
404 if (sepType == -1) {
405 // No such group (prevGroup before first shift).
406 return true;
407 } else if (sepType == 0) {
408 // First group.
409 if (isPrimary) {
410 // No grouping separators is OK.
411 return true;
412 } else {
413 // return count != 0 && count <= grouping2;
414 return count <= grouping2; // Apple <rdar://problem/38565910>, allow initial secondary group of 0
415 }
416 } else if (sepType == 1) {
417 // Middle group.
418 if (isPrimary) {
419 return count == grouping1;
420 } else {
421 return count == grouping2;
422 }
423 } else {
424 U_ASSERT(sepType == 2);
425 // After the decimal separator.
426 return true;
427 }
428 } else {
429 if (sepType == 1) {
430 // #11230: don't accept middle groups with only 1 digit.
431 return count != 1;
432 } else {
433 return true;
434 }
435 }
436 }
437
438 bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
439 // The common case uses a static leadSet for efficiency.
440 if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
441 return segment.startsWith(*leadSet);
442 }
443 if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
444 return true;
445 }
446 if (fLocalDigitStrings.isNull()) {
447 return false;
448 }
449 // The following test is Apple-specific, for <rdar://7632623>;
450 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
451 if (segment.startsWith(0x96F6) && fLocalDigitStrings[0].length()==1 && fLocalDigitStrings[0].charAt(0)==0x3007) {
452 return true;
453 }
454 for (int32_t i = 0; i < 10; i++) {
455 if (segment.startsWith(fLocalDigitStrings[i])) {
456 return true;
457 }
458 }
459 return false;
460 }
461
462 UnicodeString DecimalMatcher::toString() const {
463 return u"<Decimal>";
464 }
465
466
467 #endif /* #if !UCONFIG_NO_FORMATTING */