1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 #include "unicode/utypes.h"
6 #if !UCONFIG_NO_FORMATTING
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
12 #include "numparse_types.h"
13 #include "numparse_decimal.h"
14 #include "static_unicode_sets.h"
15 #include "numparse_utils.h"
16 #include "unicode/uchar.h"
18 #include "number_decimalquantity.h"
21 using namespace icu::numparse
;
22 using namespace icu::numparse::impl
;
// Constructs a matcher for decimal numbers from locale symbols, grouping sizes, and parse flags.
// Selects the grouping/decimal separator strings, resolves the UnicodeSets used to recognize
// separators (shared static sets when a key is available, locally owned sets otherwise),
// stores per-digit strings when the locale's zero is not a code point with digit value 0,
// and records the flag-derived options plus the primary/secondary grouping sizes.
25 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols
& symbols
, const Grouper
& grouper
,
26 parse_flags_t parseFlags
) {
// Monetary parsing: take the monetary grouping and decimal separator symbols.
27 if (0 != (parseFlags
& PARSE_FLAG_MONETARY_SEPARATORS
)) {
28 groupingSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol
);
29 decimalSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol
);
// Regular grouping and decimal separator symbols.
// NOTE(review): presumably the non-monetary fallback branch -- confirm.
31 groupingSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol
);
32 decimalSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol
);
// Strict-separator mode selects the STRICT_* variants of the separator set keys below.
34 bool strictSeparators
= 0 != (parseFlags
& PARSE_FLAG_STRICT_SEPARATORS
);
35 unisets::Key groupingKey
= strictSeparators
? unisets::STRICT_ALL_SEPARATORS
36 : unisets::ALL_SEPARATORS
;
38 // Attempt to find separators in the static cache
40 groupingUniSet
= unisets::get(groupingKey
);
// Choose between the comma set and the period set for the decimal separator.
41 unisets::Key decimalKey
= unisets::chooseFrom(
43 strictSeparators
? unisets::STRICT_COMMA
: unisets::COMMA
,
44 strictSeparators
? unisets::STRICT_PERIOD
: unisets::PERIOD
);
// A non-negative key means a shared static set can be used for the decimal separator.
45 if (decimalKey
>= 0) {
46 decimalUniSet
= unisets::get(decimalKey
);
47 } else if (!decimalSeparator
.isEmpty()) {
// No static set: build a set holding the first code point of the decimal separator;
// fLocalDecimalUniSet adopts (owns) the allocation.
48 auto* set
= new UnicodeSet();
49 set
->add(decimalSeparator
.char32At(0));
52 fLocalDecimalUniSet
.adoptInstead(set
);
// Use the shared EMPTY set.
// NOTE(review): presumably the branch for an empty decimal separator -- confirm.
54 decimalUniSet
= unisets::get(unisets::EMPTY
);
57 if (groupingKey
>= 0 && decimalKey
>= 0) {
58 // Everything is available in the static cache
59 separatorSet
= groupingUniSet
;
// NOTE(review): this ternary picks DIGITS_OR_ALL_SEPARATORS when strictSeparators is true
// and the STRICT variant otherwise, the opposite pairing from groupingKey above.
// Confirm whether this is intended.
60 leadSet
= unisets::get(
61 strictSeparators
? unisets::DIGITS_OR_ALL_SEPARATORS
62 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS
);
// Build the union of the grouping and decimal sets; fLocalSeparatorSet adopts it.
// NOTE(review): presumably the branch taken when a static key is unavailable -- confirm.
64 auto* set
= new UnicodeSet();
65 set
->addAll(*groupingUniSet
);
66 set
->addAll(*decimalUniSet
);
69 fLocalSeparatorSet
.adoptInstead(set
);
// Keep explicit digit strings when the locale's zero is missing (-1), is not a digit,
// or does not have digit value 0.
73 UChar32 cpZero
= symbols
.getCodePointZero();
74 if (cpZero
== -1 || !u_isdigit(cpZero
) || u_digit(cpZero
, 10) != 0) {
75 // Uncommon case: okay to allocate.
76 auto digitStrings
= new UnicodeString
[10];
77 fLocalDigitStrings
.adoptInstead(digitStrings
);
// Copy the symbol string for each digit 0 through 9.
78 for (int32_t i
= 0; i
<= 9; i
++) {
79 digitStrings
[i
] = symbols
.getConstDigitSymbol(i
);
// Cache the flag-derived options and the grouping sizes reported by the grouper.
83 requireGroupingMatch
= 0 != (parseFlags
& PARSE_FLAG_STRICT_GROUPING_SIZE
);
84 groupingDisabled
= 0 != (parseFlags
& PARSE_FLAG_GROUPING_DISABLED
);
85 integerOnly
= 0 != (parseFlags
& PARSE_FLAG_INTEGER_ONLY
);
86 grouping1
= grouper
.getPrimary();
87 grouping2
= grouper
.getSecondary();
89 // Fraction grouping parsing is disabled for now but could be enabled later.
90 // See http://bugs.icu-project.org/trac/ticket/10794
91 // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
// Convenience overload: forwards to the exponent-aware match() with an exponentSign of 0
// and returns its result unchanged.
94 bool DecimalMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, UErrorCode
& status
) const {
95 return match(segment
, result
, 0, status
);
// Core matcher: consumes digits and grouping/decimal separators from the front of `segment`.
// With exponentSign == 0 the consumed digits become result.quantity. With a nonzero
// exponentSign the consumed digits are read as an exponent, multiplied by exponentSign, and
// applied to the existing result.quantity through adjustMagnitude.
// Returns true when the segment was consumed to its end, or when the remaining input was a
// prefix of a digit or separator string (tracked in maybeMore).
98 bool DecimalMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, int8_t exponentSign
,
100 if (result
.seenNumber() && exponentSign
== 0) {
101 // A number has already been consumed.
103 } else if (exponentSign
!= 0) {
104 // scientific notation always comes after the number
105 U_ASSERT(!result
.quantity
.bogus
);
108 // Initial offset before any character consumption.
109 int32_t initialOffset
= segment
.getOffset();
111 // Return value: whether to ask for more characters.
112 bool maybeMore
= false;
114 // All digits consumed so far.
// Starts out bogus; it is made valid when the first digit is appended.
115 number::impl::DecimalQuantity digitsConsumed
;
116 digitsConsumed
.bogus
= true;
118 // The total number of digits after the decimal place, used for scaling the result.
119 int32_t digitsAfterDecimalPlace
= 0;
121 // The actual grouping and decimal separators used in the string.
122 // If non-bogus, we have seen that token.
123 UnicodeString actualGroupingString
;
124 UnicodeString actualDecimalString
;
125 actualGroupingString
.setToBogus();
126 actualDecimalString
.setToBogus();
128 // Information for two groups: the previous group and the current group.
130 // Each group has three pieces of information:
132 // Offset: the string position of the beginning of the group, including a leading separator
133 // if there was a leading separator. This is needed in case we need to rewind the parse to
137 // 0 => beginning of string
138 // 1 => lead separator is a grouping separator
139 // 2 => lead separator is a decimal separator
141 // Count: the number of digits in the group. If -1, the group has been validated.
142 int32_t currGroupOffset
= 0;
143 int32_t currGroupSepType
= 0;
144 int32_t currGroupCount
= 0;
145 int32_t prevGroupOffset
= -1;
146 int32_t prevGroupSepType
= -1;
147 int32_t prevGroupCount
= -1;
// Main scan loop over the remaining segment.
149 while (segment
.length() > 0) {
152 // Attempt to match a digit.
155 // Try by code point digit value.
156 UChar32 cp
= segment
.getCodePoint();
// Consume the code point and take its decimal digit value.
158 segment
.adjustOffset(U16_LENGTH(cp
));
159 digit
= static_cast<int8_t>(u_digit(cp
, 10));
162 // Try by digit string.
163 if (digit
== -1 && !fLocalDigitStrings
.isNull()) {
164 for (int32_t i
= 0; i
< 10; i
++) {
165 const UnicodeString
& str
= fLocalDigitStrings
[i
];
169 // The following test is Apple-specific, for <rdar://7632623>;
170 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
171 int32_t overlap
= (segment
.startsWith(0x96F6) && fLocalDigitStrings
[0].charAt(0)==0x3007)?
172 1: segment
.getCommonPrefixLength(str
);
// Whole digit string matched: consume it and use its index as the digit value.
173 if (overlap
== str
.length()) {
174 segment
.adjustOffset(overlap
);
175 digit
= static_cast<int8_t>(i
);
// The overlap reached the end of the segment, so more input could extend the match.
178 maybeMore
= maybeMore
|| (overlap
== segment
.length());
// First digit seen: turn the bogus placeholder into a valid, cleared quantity.
184 if (digitsConsumed
.bogus
) {
185 digitsConsumed
.bogus
= false;
186 digitsConsumed
.clear();
188 digitsConsumed
.appendDigit(digit
, 0, true);
// Digits seen after a decimal separator are counted for the final magnitude adjustment.
190 if (!actualDecimalString
.isBogus()) {
191 digitsAfterDecimalPlace
++;
196 // Attempt to match a literal grouping or decimal separator.
197 bool isDecimal
= false;
198 bool isGrouping
= false;
200 // 1) Attempt the decimal separator string literal.
201 // if (we have not seen a decimal separator yet) { ... }
202 if (actualDecimalString
.isBogus() && !decimalSeparator
.isEmpty()) {
203 int32_t overlap
= segment
.getCommonPrefixLength(decimalSeparator
);
204 maybeMore
= maybeMore
|| (overlap
== segment
.length());
205 if (overlap
== decimalSeparator
.length()) {
207 actualDecimalString
= decimalSeparator
;
211 // 2) Attempt to match the actual grouping string literal.
212 if (!actualGroupingString
.isBogus()) {
213 int32_t overlap
= segment
.getCommonPrefixLength(actualGroupingString
);
214 maybeMore
= maybeMore
|| (overlap
== segment
.length());
215 if (overlap
== actualGroupingString
.length()) {
220 // 2.5) Attempt to match a new grouping separator string literal.
221 // if (we have not seen a grouping or decimal separator yet) { ... }
222 if (!groupingDisabled
&& actualGroupingString
.isBogus() && actualDecimalString
.isBogus() &&
223 !groupingSeparator
.isEmpty()) {
224 int32_t overlap
= segment
.getCommonPrefixLength(groupingSeparator
);
225 maybeMore
= maybeMore
|| (overlap
== segment
.length());
226 if (overlap
== groupingSeparator
.length()) {
228 actualGroupingString
= groupingSeparator
;
232 // 3) Attempt to match a decimal separator from the equivalence set.
233 // if (we have not seen a decimal separator yet) { ... }
234 // The !isGrouping is to confirm that we haven't yet matched the current character.
235 if (!isGrouping
&& actualDecimalString
.isBogus()) {
236 if (decimalUniSet
->contains(cp
)) {
238 actualDecimalString
= UnicodeString(cp
);
242 // 4) Attempt to match a grouping separator from the equivalence set.
243 // if (we have not seen a grouping or decimal separator yet) { ... }
244 if (!groupingDisabled
&& actualGroupingString
.isBogus() && actualDecimalString
.isBogus()) {
245 if (groupingUniSet
->contains(cp
)) {
247 actualGroupingString
= UnicodeString(cp
);
251 // Leave if we failed to match this as a separator.
252 if (!isDecimal
&& !isGrouping
) {
256 // Check for conditions when we don't want to accept the separator.
// Case: a decimal separator while only integers are being parsed.
257 if (isDecimal
&& integerOnly
) {
// Case: a grouping separator when the current group was introduced by a decimal separator.
259 } else if (currGroupSepType
== 2 && isGrouping
) {
264 // Validate intermediate grouping sizes.
265 bool prevValidSecondary
= validateGroup(prevGroupSepType
, prevGroupCount
, false);
266 bool currValidPrimary
= validateGroup(currGroupSepType
, currGroupCount
, true);
267 if (!prevValidSecondary
|| (isDecimal
&& !currValidPrimary
)) {
268 // Invalid grouping sizes.
269 if (isGrouping
&& currGroupCount
== 0) {
270 // Trailing grouping separators: these are taken care of below
271 U_ASSERT(currGroupSepType
== 1);
272 } else if (requireGroupingMatch
) {
273 // Strict mode: reject the parse
274 digitsConsumed
.clear();
275 digitsConsumed
.bogus
= true;
// Condition: strict mode and the current group is empty after a grouping separator.
278 } else if (requireGroupingMatch
&& currGroupCount
== 0 && currGroupSepType
== 1) {
281 // Grouping sizes OK so far.
282 prevGroupOffset
= currGroupOffset
;
283 prevGroupCount
= currGroupCount
;
285 // Do not validate this group any more.
286 prevGroupSepType
= -1;
288 prevGroupSepType
= currGroupSepType
;
292 // OK to accept the separator.
293 // Special case: don't update currGroup if it is empty; this allows two grouping
294 // separators in a row in lenient mode.
295 if (currGroupCount
!= 0) {
296 currGroupOffset
= segment
.getOffset();
// Record which kind of separator introduces the new group (1 = grouping, 2 = decimal).
298 currGroupSepType
= isGrouping
? 1 : 2;
// Advance past the separator string that matched.
301 segment
.adjustOffset(actualGroupingString
.length());
303 segment
.adjustOffset(actualDecimalString
.length());
308 // Back up if there was a trailing grouping separator.
309 // Shift prev -> curr so we can check it as a final group.
310 if (currGroupSepType
!= 2 && currGroupCount
== 0) {
312 segment
.setOffset(currGroupOffset
);
313 currGroupOffset
= prevGroupOffset
;
314 currGroupSepType
= prevGroupSepType
;
315 currGroupCount
= prevGroupCount
;
316 prevGroupOffset
= -1;
317 prevGroupSepType
= 0;
321 // Validate final grouping sizes.
322 bool prevValidSecondary
= validateGroup(prevGroupSepType
, prevGroupCount
, false);
323 bool currValidPrimary
= validateGroup(currGroupSepType
, currGroupCount
, true);
324 if (!requireGroupingMatch
) {
325 // The cases we need to handle here are lone digits.
326 // Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
327 // See more examples in numberformattestspecification.txt
328 int32_t digitsToRemove
= 0;
// Previous group invalid: rewind to its start and discard both groups' digits.
329 if (!prevValidSecondary
) {
330 segment
.setOffset(prevGroupOffset
);
331 digitsToRemove
+= prevGroupCount
;
332 digitsToRemove
+= currGroupCount
;
// Only the current group is invalid: rewind to its start and discard its digits.
333 } else if (!currValidPrimary
&& (prevGroupSepType
!= 0 || prevGroupCount
!= 0)) {
335 segment
.setOffset(currGroupOffset
);
336 digitsToRemove
+= currGroupCount
;
// Drop the rejected trailing digits from the consumed quantity.
338 if (digitsToRemove
!= 0) {
339 digitsConsumed
.adjustMagnitude(-digitsToRemove
);
340 digitsConsumed
.truncate();
// Lenient mode has repaired the grouping, so treat both groups as valid from here on.
342 prevValidSecondary
= true;
343 currValidPrimary
= true;
// A group failed validation and the last separator was not a decimal separator: reject.
345 if (currGroupSepType
!= 2 && (!prevValidSecondary
|| !currValidPrimary
)) {
347 digitsConsumed
.bogus
= true;
350 // Strings that start with a separator but have no digits,
351 // or strings that failed a grouping size check.
352 if (digitsConsumed
.bogus
) {
353 maybeMore
= maybeMore
|| (segment
.length() == 0);
// Rewind to where matching started.
354 segment
.setOffset(initialOffset
);
358 // We passed all inspections. Start post-processing.
360 // Adjust for fraction part.
361 digitsConsumed
.adjustMagnitude(-digitsAfterDecimalPlace
);
363 // Set the digits, either normal or exponent.
364 if (exponentSign
!= 0 && segment
.getOffset() != initialOffset
) {
365 bool overflow
= false;
// Only an exponent that fits in a long and does not exceed INT32_MAX is applied directly.
366 if (digitsConsumed
.fitsInLong()) {
367 int64_t exponentLong
= digitsConsumed
.toLong(false);
368 U_ASSERT(exponentLong
>= 0);
369 if (exponentLong
<= INT32_MAX
) {
370 auto exponentInt
= static_cast<int32_t>(exponentLong
);
// NOTE(review): adjustMagnitude's return value is tested here; presumably true signals
// overflow -- confirm against DecimalQuantity.
371 if (result
.quantity
.adjustMagnitude(exponentSign
* exponentInt
)) {
// Negative exponent sign: the quantity is cleared.
381 if (exponentSign
== -1) {
383 result
.quantity
.clear();
// The quantity is marked bogus and the result is flagged as infinity.
// NOTE(review): presumably the alternative to the exponentSign == -1 case -- confirm.
386 result
.quantity
.bogus
= true;
387 result
.flags
|= FLAG_INFINITY
;
// Regular number: the consumed digits become the result quantity.
// NOTE(review): presumably the non-exponent branch -- confirm.
391 result
.quantity
= digitsConsumed
;
394 // Set other information into the result and return.
395 if (!actualDecimalString
.isBogus()) {
396 result
.flags
|= FLAG_HAS_DECIMAL_SEPARATOR
;
398 result
.setCharsConsumed(segment
);
399 return segment
.length() == 0 || maybeMore
;
// Checks whether a digit group is acceptable given the type of separator that introduced it
// (sepType: 0 = beginning of string, 1 = grouping separator, 2 = decimal separator), the
// number of digits in it (count), and whether it is being checked as the primary group.
// The rules visible here apply when requireGroupingMatch is set and compare count against
// grouping1 and grouping2.
402 bool DecimalMatcher::validateGroup(int32_t sepType
, int32_t count
, bool isPrimary
) const {
403 if (requireGroupingMatch
) {
405 // No such group (prevGroup before first shift).
407 } else if (sepType
== 0) {
410 // No grouping separators is OK.
413 // return count != 0 && count <= grouping2;
414 return count
<= grouping2
; // Apple <rdar://problem/38565910>, allow initial secondary group of 0
416 } else if (sepType
== 1) {
// A group after a grouping separator must match a grouping size exactly.
// NOTE(review): presumably isPrimary selects between grouping1 and grouping2 -- confirm.
419 return count
== grouping1
;
421 return count
== grouping2
;
424 U_ASSERT(sepType
== 2);
425 // After the decimal separator.
430 // #11230: don't accept middle groups with only 1 digit.
// Cheap pre-check on the start of `segment`. When no local digit strings are stored and
// leadSet is available, the answer is simply whether the segment starts with a member of
// leadSet. Otherwise the check looks at separatorSet, u_isdigit on the first code point,
// and the locally stored digit strings.
438 bool DecimalMatcher::smokeTest(const StringSegment
& segment
) const {
439 // The common case uses a static leadSet for efficiency.
440 if (fLocalDigitStrings
.isNull() && leadSet
!= nullptr) {
441 return segment
.startsWith(*leadSet
);
// Starts with a known separator or with a code point that u_isdigit accepts.
443 if (segment
.startsWith(*separatorSet
) || u_isdigit(segment
.getCodePoint())) {
// No locally stored digit strings to try.
446 if (fLocalDigitStrings
.isNull()) {
449 // The following test is Apple-specific, for <rdar://7632623>;
450 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
451 if (segment
.startsWith(0x96F6) && fLocalDigitStrings
[0].length()==1 && fLocalDigitStrings
[0].charAt(0)==0x3007) {
// Try each locally stored digit string as a prefix of the segment.
454 for (int32_t i
= 0; i
< 10; i
++) {
455 if (segment
.startsWith(fLocalDigitStrings
[i
])) {
462 UnicodeString
DecimalMatcher::toString() const {
467 #endif /* #if !UCONFIG_NO_FORMATTING */