1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 #include "unicode/utypes.h"
6 #if !UCONFIG_NO_FORMATTING
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
12 #include "numparse_types.h"
13 #include "numparse_decimal.h"
14 #include "static_unicode_sets.h"
15 #include "numparse_utils.h"
16 #include "unicode/uchar.h"
18 #include "number_decimalquantity.h"
21 using namespace icu::numparse
;
22 using namespace icu::numparse::impl
;
// Constructs a DecimalMatcher from the locale's symbols, a Grouper, and the parse flags.
//
// What the visible code does:
//  - reads the grouping and decimal separator strings from `symbols`;
//  - resolves the UnicodeSets used for separators (groupingUniSet, decimalUniSet,
//    separatorSet, leadSet), allocating local sets where shown;
//  - copies the ten digit symbol strings when the locale's zero code point is not a
//    decimal digit whose value is 0;
//  - caches booleans derived from `parseFlags` and the primary/secondary grouping
//    sizes from `grouper`.
//
// @param symbols    Source of the separator strings, digit symbols and zero code point.
// @param grouper    Source of grouping1 (getPrimary) and grouping2 (getSecondary).
// @param parseFlags Bit mask tested against the PARSE_FLAG_* constants below.
//
// NOTE(review): the embedded line numbers in this listing have gaps (e.g. 30, 33, 42,
// 50-51, 53, 55-56, 63, 67-68, 70-72, 80-82, 92), so `else` lines, closing braces and
// possibly whole statements are not visible here. Comments describe only what is shown.
25 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols
& symbols
, const Grouper
& grouper
,
26 parse_flags_t parseFlags
) {
// With the monetary flag set, use the monetary grouping/decimal separator symbols.
27 if (0 != (parseFlags
& PARSE_FLAG_MONETARY_SEPARATORS
)) {
28 groupingSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol
);
29 decimalSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol
);
// NOTE(review): embedded line 30 is absent; the two assignments below are presumably
// the non-monetary `else` branch -- confirm against the full file.
31 groupingSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol
);
32 decimalSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol
);
// Strict-separator mode selects the STRICT_* key for the grouping set.
34 bool strictSeparators
= 0 != (parseFlags
& PARSE_FLAG_STRICT_SEPARATORS
);
35 unisets::Key groupingKey
= strictSeparators
? unisets::STRICT_ALL_SEPARATORS
36 : unisets::ALL_SEPARATORS
;
38 // Attempt to find separators in the static cache
40 groupingUniSet
= unisets::get(groupingKey
);
// NOTE(review): embedded line 42 is absent, so only two of chooseFrom's arguments are
// visible (the comma and period keys, strict or lenient).
41 unisets::Key decimalKey
= unisets::chooseFrom(
43 strictSeparators
? unisets::STRICT_COMMA
: unisets::COMMA
,
44 strictSeparators
? unisets::STRICT_PERIOD
: unisets::PERIOD
);
// decimalKey >= 0: a statically cached set was chosen for the decimal separator.
45 if (decimalKey
>= 0) {
46 decimalUniSet
= unisets::get(decimalKey
);
// Otherwise, for a non-empty separator, allocate a set holding only the separator's
// first code point; fLocalDecimalUniSet adopts (takes ownership of) the allocation.
47 } else if (!decimalSeparator
.isEmpty()) {
48 auto* set
= new UnicodeSet();
49 set
->add(decimalSeparator
.char32At(0));
52 fLocalDecimalUniSet
.adoptInstead(set
);
// NOTE(review): presumably the final `else` (empty decimal separator) -- the lines
// around this assignment are not visible.
54 decimalUniSet
= unisets::get(unisets::EMPTY
);
57 if (groupingKey
>= 0 && decimalKey
>= 0) {
58 // Everything is available in the static cache
59 separatorSet
= groupingUniSet
;
// NOTE(review): strict mode selects DIGITS_OR_ALL_SEPARATORS here and lenient mode
// selects DIGITS_OR_STRICT_ALL_SEPARATORS, the opposite pairing from groupingKey above
// (strict -> STRICT_ALL_SEPARATORS). smokeTest() filters on leadSet, so confirm whether
// the two arms of this ternary are intentionally in this order.
60 leadSet
= unisets::get(
61 strictSeparators
? unisets::DIGITS_OR_ALL_SEPARATORS
62 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS
);
// Build a locally owned union of the grouping and decimal sets; fLocalSeparatorSet
// adopts it. NOTE(review): presumably the `else` of the static-cache check above;
// the lines that assign separatorSet/leadSet on this path are not visible.
64 auto* set
= new UnicodeSet();
65 set
->addAll(*groupingUniSet
);
66 set
->addAll(*decimalUniSet
);
69 fLocalSeparatorSet
.adoptInstead(set
);
// Digit strings are only stored when the zero code point is missing (-1), is not a
// digit, or does not have decimal value 0.
73 UChar32 cpZero
= symbols
.getCodePointZero();
74 if (cpZero
== -1 || !u_isdigit(cpZero
) || u_digit(cpZero
, 10) != 0) {
75 // Uncommon case: okay to allocate.
76 auto digitStrings
= new UnicodeString
[10];
77 fLocalDigitStrings
.adoptInstead(digitStrings
);
// Copy the symbol for each digit 0..9 into the adopted array.
78 for (int32_t i
= 0; i
<= 9; i
++) {
79 digitStrings
[i
] = symbols
.getConstDigitSymbol(i
);
// Cache flag-derived options and the grouping sizes for use while matching.
83 requireGroupingMatch
= 0 != (parseFlags
& PARSE_FLAG_STRICT_GROUPING_SIZE
);
84 groupingDisabled
= 0 != (parseFlags
& PARSE_FLAG_GROUPING_DISABLED
);
85 integerOnly
= 0 != (parseFlags
& PARSE_FLAG_INTEGER_ONLY
);
86 grouping1
= grouper
.getPrimary();
87 grouping2
= grouper
.getSecondary();
89 // Fraction grouping parsing is disabled for now but could be enabled later.
90 // See http://bugs.icu-project.org/trac/ticket/10794
91 // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
// Three-argument overload: forwards to the four-argument match() with exponentSign 0
// and returns its result unchanged.
//
// @param segment Input text, passed through to the four-argument overload.
// @param result  Parse result, passed through to the four-argument overload.
// @param status  Error code, passed through to the four-argument overload.
94 bool DecimalMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, UErrorCode
& status
) const {
95 return match(segment
, result
, 0, status
);
// Four-argument overload: scans digits and grouping/decimal separators from the front
// of `segment` and stores the outcome in `result`.
//
// Visible behaviour:
//  - digits are accumulated in a local DecimalQuantity (digitsConsumed), which starts
//    out marked bogus until the first digit is appended;
//  - separators are tracked per group (offset, separator type, digit count) for the
//    previous and current group and checked with validateGroup();
//  - when digitsConsumed is still bogus after the scan, the segment offset is reset to
//    its initial value;
//  - otherwise the magnitude is shifted by the number of fraction digits, and the value
//    is either used as an exponent applied to result.quantity (exponentSign != 0) or
//    assigned to result.quantity;
//  - the last visible statement returns `segment.length() == 0 || maybeMore`.
//
// @param segment      Input text; its offset is moved with adjustOffset()/setOffset().
// @param result       Receives quantity, flags and the consumed-character count.
// @param exponentSign 0 for a plain number; non-zero when the digits are an exponent
//                     (it multiplies the parsed exponent and is compared against -1).
//
// NOTE(review): embedded line 99 (the remainder of the signature) and many body lines
// (closing braces, `else`, `break`/`return` statements) are absent from this listing.
// Comments describe only what is shown; hedged notes mark the visible gaps.
98 bool DecimalMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, int8_t exponentSign
,
100 if (result
.seenNumber() && exponentSign
== 0) {
101 // A number has already been consumed.
// NOTE(review): embedded line 102 is absent; presumably an early return for this case.
103 } else if (exponentSign
!= 0) {
104 // scientific notation always comes after the number
105 U_ASSERT(!result
.quantity
.bogus
);
108 // Initial offset before any character consumption.
109 int32_t initialOffset
= segment
.getOffset();
111 // Return value: whether to ask for more characters.
112 bool maybeMore
= false;
114 // All digits consumed so far.
115 number::impl::DecimalQuantity digitsConsumed
;
116 digitsConsumed
.bogus
= true;
118 // The total number of digits after the decimal place, used for scaling the result.
119 int32_t digitsAfterDecimalPlace
= 0;
121 // The actual grouping and decimal separators used in the string.
122 // If non-null, we have seen that token.
// Both strings are set to bogus below; isBogus() is the "not seen yet" test used later.
123 UnicodeString actualGroupingString
;
124 UnicodeString actualDecimalString
;
125 actualGroupingString
.setToBogus();
126 actualDecimalString
.setToBogus();
128 // Information for two groups: the previous group and the current group.
130 // Each group has three pieces of information:
132 // Offset: the string position of the beginning of the group, including a leading separator
133 // if there was a leading separator. This is needed in case we need to rewind the parse to
137 // 0 => beginning of string
138 // 1 => lead separator is a grouping separator
139 // 2 => lead separator is a decimal separator
141 // Count: the number of digits in the group. If -1, the group has been validated.
142 int32_t currGroupOffset
= 0;
143 int32_t currGroupSepType
= 0;
144 int32_t currGroupCount
= 0;
145 int32_t prevGroupOffset
= -1;
146 int32_t prevGroupSepType
= -1;
147 int32_t prevGroupCount
= -1;
// Main scan loop: runs while the segment still has characters.
149 while (segment
.length() > 0) {
152 // Attempt to match a digit.
155 // Try by code point digit value.
156 UChar32 cp
= segment
.getCodePoint();
// NOTE(review): embedded line 157 is absent; presumably the condition guarding the two
// statements below (consume the code point and take its decimal digit value).
158 segment
.adjustOffset(U16_LENGTH(cp
));
159 digit
= static_cast<int8_t>(u_digit(cp
, 10));
162 // Try by digit string.
// Only reached when no digit was found by code point and the constructor stored
// locale digit strings.
163 if (digit
== -1 && !fLocalDigitStrings
.isNull()) {
164 for (int32_t i
= 0; i
< 10; i
++) {
165 const UnicodeString
& str
= fLocalDigitStrings
[i
];
// A full match of the digit string consumes it and yields digit value i.
169 int32_t overlap
= segment
.getCommonPrefixLength(str
);
170 if (overlap
== str
.length()) {
171 segment
.adjustOffset(overlap
);
172 digit
= static_cast<int8_t>(i
);
// The whole remaining segment is a prefix of this digit string: more input could
// complete it.
175 maybeMore
= maybeMore
|| (overlap
== segment
.length());
// First digit seen: turn digitsConsumed into a real, empty quantity before appending.
181 if (digitsConsumed
.bogus
) {
182 digitsConsumed
.bogus
= false;
183 digitsConsumed
.clear();
185 digitsConsumed
.appendDigit(digit
, 0, true);
// Digits seen after a decimal separator are counted so the magnitude can be adjusted
// after the loop.
187 if (!actualDecimalString
.isBogus()) {
188 digitsAfterDecimalPlace
++;
193 // Attempt to match a literal grouping or decimal separator.
194 bool isDecimal
= false;
195 bool isGrouping
= false;
197 // 1) Attempt the decimal separator string literal.
198 // if (we have not seen a decimal separator yet) { ... }
199 if (actualDecimalString
.isBogus() && !decimalSeparator
.isEmpty()) {
200 int32_t overlap
= segment
.getCommonPrefixLength(decimalSeparator
);
201 maybeMore
= maybeMore
|| (overlap
== segment
.length());
202 if (overlap
== decimalSeparator
.length()) {
204 actualDecimalString
= decimalSeparator
;
208 // 2) Attempt to match the actual grouping string literal.
209 if (!actualGroupingString
.isBogus()) {
210 int32_t overlap
= segment
.getCommonPrefixLength(actualGroupingString
);
211 maybeMore
= maybeMore
|| (overlap
== segment
.length());
212 if (overlap
== actualGroupingString
.length()) {
217 // 2.5) Attempt to match a new grouping separator string literal.
218 // if (we have not seen a grouping or decimal separator yet) { ... }
219 if (!groupingDisabled
&& actualGroupingString
.isBogus() && actualDecimalString
.isBogus() &&
220 !groupingSeparator
.isEmpty()) {
221 int32_t overlap
= segment
.getCommonPrefixLength(groupingSeparator
);
222 maybeMore
= maybeMore
|| (overlap
== segment
.length());
223 if (overlap
== groupingSeparator
.length()) {
225 actualGroupingString
= groupingSeparator
;
229 // 3) Attempt to match a decimal separator from the equivalence set.
230 // if (we have not seen a decimal separator yet) { ... }
231 // The !isGrouping is to confirm that we haven't yet matched the current character.
232 if (!isGrouping
&& actualDecimalString
.isBogus()) {
233 if (decimalUniSet
->contains(cp
)) {
235 actualDecimalString
= UnicodeString(cp
);
239 // 4) Attempt to match a grouping separator from the equivalence set.
240 // if (we have not seen a grouping or decimal separator yet) { ... }
241 if (!groupingDisabled
&& actualGroupingString
.isBogus() && actualDecimalString
.isBogus()) {
242 if (groupingUniSet
->contains(cp
)) {
244 actualGroupingString
= UnicodeString(cp
);
248 // Leave if we failed to match this as a separator.
249 if (!isDecimal
&& !isGrouping
) {
253 // Check for conditions when we don't want to accept the separator.
254 if (isDecimal
&& integerOnly
) {
256 } else if (currGroupSepType
== 2 && isGrouping
) {
261 // Validate intermediate grouping sizes.
262 bool prevValidSecondary
= validateGroup(prevGroupSepType
, prevGroupCount
, false);
263 bool currValidPrimary
= validateGroup(currGroupSepType
, currGroupCount
, true);
264 if (!prevValidSecondary
|| (isDecimal
&& !currValidPrimary
)) {
265 // Invalid grouping sizes.
266 if (isGrouping
&& currGroupCount
== 0) {
267 // Trailing grouping separators: these are taken care of below
268 U_ASSERT(currGroupSepType
== 1);
269 } else if (requireGroupingMatch
) {
270 // Strict mode: reject the parse
271 digitsConsumed
.clear();
272 digitsConsumed
.bogus
= true;
275 } else if (requireGroupingMatch
&& currGroupCount
== 0 && currGroupSepType
== 1) {
278 // Grouping sizes OK so far.
279 prevGroupOffset
= currGroupOffset
;
280 prevGroupCount
= currGroupCount
;
282 // Do not validate this group any more.
283 prevGroupSepType
= -1;
285 prevGroupSepType
= currGroupSepType
;
289 // OK to accept the separator.
290 // Special case: don't update currGroup if it is empty; this allows two grouping
291 // separators in a row in lenient mode.
292 if (currGroupCount
!= 0) {
293 currGroupOffset
= segment
.getOffset();
295 currGroupSepType
= isGrouping
? 1 : 2;
// NOTE(review): embedded lines 297 and 299 are absent; presumably an if/else on
// isGrouping chooses which of the two offset adjustments below runs.
298 segment
.adjustOffset(actualGroupingString
.length());
300 segment
.adjustOffset(actualDecimalString
.length());
305 // Back up if there was a trailing grouping separator.
306 // Shift prev -> curr so we can check it as a final group.
307 if (currGroupSepType
!= 2 && currGroupCount
== 0) {
309 segment
.setOffset(currGroupOffset
);
310 currGroupOffset
= prevGroupOffset
;
311 currGroupSepType
= prevGroupSepType
;
312 currGroupCount
= prevGroupCount
;
313 prevGroupOffset
= -1;
314 prevGroupSepType
= 0;
318 // Validate final grouping sizes.
319 bool prevValidSecondary
= validateGroup(prevGroupSepType
, prevGroupCount
, false);
320 bool currValidPrimary
= validateGroup(currGroupSepType
, currGroupCount
, true);
321 if (!requireGroupingMatch
) {
322 // The cases we need to handle here are lone digits.
323 // Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
324 // See more examples in numberformattestspecification.txt
325 int32_t digitsToRemove
= 0;
// Rewind to the start of the first invalid group and count the digits to drop.
326 if (!prevValidSecondary
) {
327 segment
.setOffset(prevGroupOffset
);
328 digitsToRemove
+= prevGroupCount
;
329 digitsToRemove
+= currGroupCount
;
330 } else if (!currValidPrimary
&& (prevGroupSepType
!= 0 || prevGroupCount
!= 0)) {
332 segment
.setOffset(currGroupOffset
);
333 digitsToRemove
+= currGroupCount
;
// Drop the counted trailing digits: shift the magnitude down, then truncate.
335 if (digitsToRemove
!= 0) {
336 digitsConsumed
.adjustMagnitude(-digitsToRemove
);
337 digitsConsumed
.truncate();
339 prevValidSecondary
= true;
340 currValidPrimary
= true;
342 if (currGroupSepType
!= 2 && (!prevValidSecondary
|| !currValidPrimary
)) {
344 digitsConsumed
.bogus
= true;
347 // Strings that start with a separator but have no digits,
348 // or strings that failed a grouping size check.
349 if (digitsConsumed
.bogus
) {
350 maybeMore
= maybeMore
|| (segment
.length() == 0);
351 segment
.setOffset(initialOffset
);
355 // We passed all inspections. Start post-processing.
357 // Adjust for fraction part.
358 digitsConsumed
.adjustMagnitude(-digitsAfterDecimalPlace
);
360 // Set the digits, either normal or exponent.
361 if (exponentSign
!= 0 && segment
.getOffset() != initialOffset
) {
362 bool overflow
= false;
363 if (digitsConsumed
.fitsInLong()) {
364 int64_t exponentLong
= digitsConsumed
.toLong(false);
365 U_ASSERT(exponentLong
>= 0);
366 if (exponentLong
<= INT32_MAX
) {
367 auto exponentInt
= static_cast<int32_t>(exponentLong
);
368 if (result
.quantity
.adjustMagnitude(exponentSign
* exponentInt
)) {
// NOTE(review): embedded lines 369-377 are absent; presumably they set `overflow` and
// guard the branch below. Visible: with exponentSign == -1 the quantity is cleared;
// the other visible statements mark the quantity bogus and set FLAG_INFINITY.
378 if (exponentSign
== -1) {
380 result
.quantity
.clear();
383 result
.quantity
.bogus
= true;
384 result
.flags
|= FLAG_INFINITY
;
388 result
.quantity
= digitsConsumed
;
391 // Set other information into the result and return.
392 if (!actualDecimalString
.isBogus()) {
393 result
.flags
|= FLAG_HAS_DECIMAL_SEPARATOR
;
395 result
.setCharsConsumed(segment
);
396 return segment
.length() == 0 || maybeMore
;
// Checks whether a group with `count` digits is acceptable given the kind of separator
// that led into it.
//
// @param sepType   Group separator type; the visible branches test 0 and 1 and assert 2
//                  (see the 0/1/2 legend in the four-argument match()).
// @param count     Number of digits in the group.
// @param isPrimary NOTE(review): not referenced on any visible line; presumably it
//                  selects between the grouping1 and grouping2 comparisons below.
// @return Visible returns compare `count` against grouping1/grouping2.
//
// NOTE(review): most `return` statements and several branch headers (embedded lines
// 401, 403, 405-406, 408-409, 411, 413-414, 416, 418-419, 422-425, 427 onward) are
// absent from this listing, so the full decision table cannot be confirmed from here.
399 bool DecimalMatcher::validateGroup(int32_t sepType
, int32_t count
, bool isPrimary
) const {
// Strict grouping mode is handled first.
400 if (requireGroupingMatch
) {
402 // No such group (prevGroup before first shift).
404 } else if (sepType
== 0) {
407 // No grouping separators is OK.
410 return count
!= 0 && count
<= grouping2
;
412 } else if (sepType
== 1) {
415 return count
== grouping1
;
417 return count
== grouping2
;
420 U_ASSERT(sepType
== 2);
421 // After the decimal separator.
426 // #11230: don't accept middle groups with only 1 digit.
// Pre-check on the start of `segment`, using the sets and digit strings prepared by the
// constructor.
//
// Visible behaviour: when no locale digit strings are stored and leadSet is non-null,
// the result is simply whether the segment starts with a member of leadSet. Otherwise
// the segment is tested against separatorSet / u_isdigit, and then against each of the
// ten stored digit strings.
//
// @param segment Input text; only read (it is taken by const reference).
//
// NOTE(review): the return statements of the slower path (embedded lines 440, 443, 447,
// 450) are absent from this listing; presumably a hit returns true and falling through
// returns false -- confirm against the full file.
434 bool DecimalMatcher::smokeTest(const StringSegment
& segment
) const {
435 // The common case uses a static leadSet for efficiency.
436 if (fLocalDigitStrings
.isNull() && leadSet
!= nullptr) {
437 return segment
.startsWith(*leadSet
);
// Slower path: test separators and code-point digits directly.
439 if (segment
.startsWith(*separatorSet
) || u_isdigit(segment
.getCodePoint())) {
// Without locale digit strings there is nothing further to test.
442 if (fLocalDigitStrings
.isNull()) {
// Test each stored digit string as a prefix of the segment.
445 for (int32_t i
= 0; i
< 10; i
++) {
446 if (segment
.startsWith(fLocalDigitStrings
[i
])) {
// Returns a UnicodeString for this matcher.
// NOTE(review): the body (embedded lines 454-456) is absent from this listing, so the
// returned text cannot be described from here.
453 UnicodeString
DecimalMatcher::toString() const {
458 #endif /* #if !UCONFIG_NO_FORMATTING */