1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 #include "unicode/utypes.h"
6 #if !UCONFIG_NO_FORMATTING
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
12 #include "numparse_types.h"
13 #include "numparse_decimal.h"
14 #include "static_unicode_sets.h"
15 #include "numparse_utils.h"
16 #include "unicode/uchar.h"
18 #include "number_decimalquantity.h"
19 #include "string_segment.h"
22 using namespace icu::numparse
;
23 using namespace icu::numparse::impl
;
// Builds a matcher from the locale symbols, the grouping strategy and the parse flags.
// It selects the grouping/decimal separator strings, resolves the UnicodeSets used to
// recognize equivalent separators, optionally captures the locale's digit strings, and
// records the flag-derived options and grouping sizes consulted by match() and
// validateGroup().
26 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols
& symbols
, const Grouper
& grouper
,
27 parse_flags_t parseFlags
) {
// Separator strings: the monetary symbols are used when PARSE_FLAG_MONETARY_SEPARATORS is
// set; the second pair of assignments uses the regular grouping/decimal symbols
// (presumably the else branch).
28 if (0 != (parseFlags
& PARSE_FLAG_MONETARY_SEPARATORS
)) {
29 groupingSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol
);
30 decimalSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol
);
32 groupingSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol
);
33 decimalSeparator
= symbols
.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol
);
// Strict-separator mode selects the STRICT_* variants of the static set keys below.
35 bool strictSeparators
= 0 != (parseFlags
& PARSE_FLAG_STRICT_SEPARATORS
);
36 unisets::Key groupingKey
= strictSeparators
? unisets::STRICT_ALL_SEPARATORS
37 : unisets::ALL_SEPARATORS
;
39 // Attempt to find separators in the static cache
41 groupingUniSet
= unisets::get(groupingKey
);
// decimalKey is picked between the comma and period keys; a negative value means that
// neither static set applies.
42 unisets::Key decimalKey
= unisets::chooseFrom(
44 strictSeparators
? unisets::STRICT_COMMA
: unisets::COMMA
,
45 strictSeparators
? unisets::STRICT_PERIOD
: unisets::PERIOD
);
// Decimal set: the static set when a key was found; otherwise a locally owned set holding
// the first code point of a non-empty decimalSeparator; the last assignment uses the
// static EMPTY set (presumably the final else branch).
46 if (decimalKey
>= 0) {
47 decimalUniSet
= unisets::get(decimalKey
);
48 } else if (!decimalSeparator
.isEmpty()) {
49 auto* set
= new UnicodeSet();
50 set
->add(decimalSeparator
.char32At(0));
53 fLocalDecimalUniSet
.adoptInstead(set
);
55 decimalUniSet
= unisets::get(unisets::EMPTY
);
58 if (groupingKey
>= 0 && decimalKey
>= 0) {
59 // Everything is available in the static cache
60 separatorSet
= groupingUniSet
;
// NOTE(review): this ternary selects DIGITS_OR_ALL_SEPARATORS when strictSeparators is
// true and DIGITS_OR_STRICT_ALL_SEPARATORS otherwise, the opposite pairing from
// groupingKey above. Confirm that this is intended.
61 leadSet
= unisets::get(
62 strictSeparators
? unisets::DIGITS_OR_ALL_SEPARATORS
63 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS
);
// Otherwise a union of the grouping and decimal sets is allocated and handed to
// fLocalSeparatorSet.
65 auto* set
= new UnicodeSet();
66 set
->addAll(*groupingUniSet
);
67 set
->addAll(*decimalUniSet
);
70 fLocalSeparatorSet
.adoptInstead(set
);
// Digit strings are only captured when the locale's zero is missing (-1), is not a digit,
// or does not have digit value 0; match() then tries them when u_digit() yields no digit.
74 UChar32 cpZero
= symbols
.getCodePointZero();
75 if (cpZero
== -1 || !u_isdigit(cpZero
) || u_digit(cpZero
, 10) != 0) {
76 // Uncommon case: okay to allocate.
77 auto digitStrings
= new UnicodeString
[10];
78 fLocalDigitStrings
.adoptInstead(digitStrings
);
79 for (int32_t i
= 0; i
<= 9; i
++) {
80 digitStrings
[i
] = symbols
.getConstDigitSymbol(i
);
// Cache the remaining parse flags and the primary/secondary grouping sizes.
84 requireGroupingMatch
= 0 != (parseFlags
& PARSE_FLAG_STRICT_GROUPING_SIZE
);
85 groupingDisabled
= 0 != (parseFlags
& PARSE_FLAG_GROUPING_DISABLED
);
86 integerOnly
= 0 != (parseFlags
& PARSE_FLAG_INTEGER_ONLY
);
87 grouping1
= grouper
.getPrimary();
88 grouping2
= grouper
.getSecondary();
90 // Fraction grouping parsing is disabled for now but could be enabled later.
91 // See http://bugs.icu-project.org/trac/ticket/10794
92 // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
// Matches a regular (non-exponent) number: forwards to the exponent-aware overload with an
// exponentSign of 0 and returns its result.
95 bool DecimalMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, UErrorCode
& status
) const {
96 return match(segment
, result
, 0, status
);
// Core matcher. Consumes a run of digits and grouping/decimal separators from the front of
// `segment`, tracking the current and previous digit groups so that grouping sizes can be
// validated and trailing or invalid groups rewound. Digits are accumulated in a local
// DecimalQuantity and, once the checks pass, written to `result`: as the number itself, or
// as a magnitude adjustment of result.quantity when exponentSign is nonzero.
//
// exponentSign: 0 when parsing the number proper; nonzero when parsing exponent digits
//               (only the value -1 is tested explicitly below).
// NOTE(review): the declaration of the fourth parameter is not visible here; the
// three-argument overload passes its UErrorCode in that position.
//
// The final return expression is true when the segment was fully consumed or when a
// partial (prefix) match was observed, i.e. more input could change the outcome.
99 bool DecimalMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, int8_t exponentSign
,
101 if (result
.seenNumber() && exponentSign
== 0) {
102 // A number has already been consumed.
104 } else if (exponentSign
!= 0) {
105 // scientific notation always comes after the number
106 U_ASSERT(!result
.quantity
.bogus
);
109 // Initial offset before any character consumption.
110 int32_t initialOffset
= segment
.getOffset();
112 // Return value: whether to ask for more characters.
113 bool maybeMore
= false;
115 // All digits consumed so far.
// The bogus flag doubles as the "no digit seen yet" marker.
116 number::impl::DecimalQuantity digitsConsumed
;
117 digitsConsumed
.bogus
= true;
119 // The total number of digits after the decimal place, used for scaling the result.
120 int32_t digitsAfterDecimalPlace
= 0;
122 // The actual grouping and decimal separators used in the string.
123 // If non-null, we have seen that token.
// "Non-null" is represented here by the UnicodeString not being bogus.
124 UnicodeString actualGroupingString
;
125 UnicodeString actualDecimalString
;
126 actualGroupingString
.setToBogus();
127 actualDecimalString
.setToBogus();
129 // Information for two groups: the previous group and the current group.
131 // Each group has three pieces of information:
133 // Offset: the string position of the beginning of the group, including a leading separator
134 // if there was a leading separator. This is needed in case we need to rewind the parse to
138 // 0 => beginning of string
139 // 1 => lead separator is a grouping separator
140 // 2 => lead separator is a decimal separator
142 // Count: the number of digits in the group. If -1, the group has been validated.
143 int32_t currGroupOffset
= 0;
144 int32_t currGroupSepType
= 0;
145 int32_t currGroupCount
= 0;
146 int32_t prevGroupOffset
= -1;
147 int32_t prevGroupSepType
= -1;
148 int32_t prevGroupCount
= -1;
// Main loop: runs while unconsumed characters remain in the segment.
150 while (segment
.length() > 0) {
153 // Attempt to match a digit.
156 // Try by code point digit value.
157 UChar32 cp
= segment
.getCodePoint();
159 segment
.adjustOffset(U16_LENGTH(cp
));
160 digit
= static_cast<int8_t>(u_digit(cp
, 10));
163 // Try by digit string.
164 if (digit
== -1 && !fLocalDigitStrings
.isNull()) {
165 for (int32_t i
= 0; i
< 10; i
++) {
166 const UnicodeString
& str
= fLocalDigitStrings
[i
];
170 // The following test is Apple-specific, for <rdar://7632623>;
171 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
172 int32_t overlap
= (segment
.startsWith(0x96F6) && fLocalDigitStrings
[0].charAt(0)==0x3007)?
173 1: segment
.getCommonPrefixLength(str
);
174 if (overlap
== str
.length()) {
175 segment
.adjustOffset(overlap
);
176 digit
= static_cast<int8_t>(i
);
// A prefix match that reaches the end of the segment means more input could still
// complete this digit string.
179 maybeMore
= maybeMore
|| (overlap
== segment
.length());
// First digit: switch digitsConsumed from the bogus state to an empty quantity.
185 if (digitsConsumed
.bogus
) {
186 digitsConsumed
.bogus
= false;
187 digitsConsumed
.clear();
// Append the digit. Digits seen after a decimal separator are counted so that the
// magnitude can be corrected once the loop finishes.
189 digitsConsumed
.appendDigit(digit
, 0, true);
191 if (!actualDecimalString
.isBogus()) {
192 digitsAfterDecimalPlace
++;
197 // Attempt to match a literal grouping or decimal separator.
198 bool isDecimal
= false;
199 bool isGrouping
= false;
201 // 1) Attempt the decimal separator string literal.
202 // if (we have not seen a decimal separator yet) { ... }
203 if (actualDecimalString
.isBogus() && !decimalSeparator
.isEmpty()) {
204 int32_t overlap
= segment
.getCommonPrefixLength(decimalSeparator
);
205 maybeMore
= maybeMore
|| (overlap
== segment
.length());
206 if (overlap
== decimalSeparator
.length()) {
208 actualDecimalString
= decimalSeparator
;
212 // 2) Attempt to match the actual grouping string literal.
213 if (!actualGroupingString
.isBogus()) {
214 int32_t overlap
= segment
.getCommonPrefixLength(actualGroupingString
);
215 maybeMore
= maybeMore
|| (overlap
== segment
.length());
216 if (overlap
== actualGroupingString
.length()) {
221 // 2.5) Attempt to match a new grouping separator string literal.
222 // if (we have not seen a grouping or decimal separator yet) { ... }
223 if (!groupingDisabled
&& actualGroupingString
.isBogus() && actualDecimalString
.isBogus() &&
224 !groupingSeparator
.isEmpty()) {
225 int32_t overlap
= segment
.getCommonPrefixLength(groupingSeparator
);
226 maybeMore
= maybeMore
|| (overlap
== segment
.length());
227 if (overlap
== groupingSeparator
.length()) {
229 actualGroupingString
= groupingSeparator
;
233 // 3) Attempt to match a decimal separator from the equivalence set.
234 // if (we have not seen a decimal separator yet) { ... }
235 // The !isGrouping is to confirm that we haven't yet matched the current character.
236 if (!isGrouping
&& actualDecimalString
.isBogus()) {
237 if (decimalUniSet
->contains(cp
)) {
239 actualDecimalString
= UnicodeString(cp
);
243 // 4) Attempt to match a grouping separator from the equivalence set.
244 // if (we have not seen a grouping or decimal separator yet) { ... }
245 if (!groupingDisabled
&& actualGroupingString
.isBogus() && actualDecimalString
.isBogus()) {
246 if (groupingUniSet
->contains(cp
)) {
248 actualGroupingString
= UnicodeString(cp
);
252 // Leave if we failed to match this as a separator.
253 if (!isDecimal
&& !isGrouping
) {
257 // Check for conditions when we don't want to accept the separator.
258 if (isDecimal
&& integerOnly
) {
260 } else if (currGroupSepType
== 2 && isGrouping
) {
265 // Validate intermediate grouping sizes.
266 bool prevValidSecondary
= validateGroup(prevGroupSepType
, prevGroupCount
, false);
267 bool currValidPrimary
= validateGroup(currGroupSepType
, currGroupCount
, true);
268 if (!prevValidSecondary
|| (isDecimal
&& !currValidPrimary
)) {
269 // Invalid grouping sizes.
270 if (isGrouping
&& currGroupCount
== 0) {
271 // Trailing grouping separators: these are taken care of below
272 U_ASSERT(currGroupSepType
== 1);
273 } else if (requireGroupingMatch
) {
274 // Strict mode: reject the parse
275 digitsConsumed
.clear();
276 digitsConsumed
.bogus
= true;
279 } else if (requireGroupingMatch
&& currGroupCount
== 0 && currGroupSepType
== 1) {
282 // Grouping sizes OK so far.
// Shift the current group into the "previous" slot.
283 prevGroupOffset
= currGroupOffset
;
284 prevGroupCount
= currGroupCount
;
286 // Do not validate this group any more.
287 prevGroupSepType
= -1;
289 prevGroupSepType
= currGroupSepType
;
293 // OK to accept the separator.
294 // Special case: don't update currGroup if it is empty; this allows two grouping
295 // separators in a row in lenient mode.
296 if (currGroupCount
!= 0) {
297 currGroupOffset
= segment
.getOffset();
299 currGroupSepType
= isGrouping
? 1 : 2;
// Advance past the accepted separator by the length of the grouping or decimal string
// that was recorded for it.
302 segment
.adjustOffset(actualGroupingString
.length());
304 segment
.adjustOffset(actualDecimalString
.length());
309 // Back up if there was a trailing grouping separator.
310 // Shift prev -> curr so we can check it as a final group.
311 if (currGroupSepType
!= 2 && currGroupCount
== 0) {
313 segment
.setOffset(currGroupOffset
);
314 currGroupOffset
= prevGroupOffset
;
315 currGroupSepType
= prevGroupSepType
;
316 currGroupCount
= prevGroupCount
;
317 prevGroupOffset
= -1;
318 prevGroupSepType
= 0;
322 // Validate final grouping sizes.
323 bool prevValidSecondary
= validateGroup(prevGroupSepType
, prevGroupCount
, false);
324 bool currValidPrimary
= validateGroup(currGroupSepType
, currGroupCount
, true);
325 if (!requireGroupingMatch
) {
326 // The cases we need to handle here are lone digits.
327 // Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
328 // See more examples in numberformattestspecification.txt
329 int32_t digitsToRemove
= 0;
330 if (!prevValidSecondary
) {
331 segment
.setOffset(prevGroupOffset
);
332 digitsToRemove
+= prevGroupCount
;
333 digitsToRemove
+= currGroupCount
;
334 } else if (!currValidPrimary
&& (prevGroupSepType
!= 0 || prevGroupCount
!= 0)) {
336 segment
.setOffset(currGroupOffset
);
337 digitsToRemove
+= currGroupCount
;
// Drop the digits of the rejected group(s): shift them below the decimal point and then
// call truncate() (presumably discarding the shifted digits).
339 if (digitsToRemove
!= 0) {
340 digitsConsumed
.adjustMagnitude(-digitsToRemove
);
341 digitsConsumed
.truncate();
343 prevValidSecondary
= true;
344 currValidPrimary
= true;
346 if (currGroupSepType
!= 2 && (!prevValidSecondary
|| !currValidPrimary
)) {
348 digitsConsumed
.bogus
= true;
351 // Strings that start with a separator but have no digits,
352 // or strings that failed a grouping size check.
353 if (digitsConsumed
.bogus
) {
354 maybeMore
= maybeMore
|| (segment
.length() == 0);
355 segment
.setOffset(initialOffset
);
359 // We passed all inspections. Start post-processing.
361 // Adjust for fraction part.
362 digitsConsumed
.adjustMagnitude(-digitsAfterDecimalPlace
);
364 // Set the digits, either normal or exponent.
// Exponent path: taken only when exponent digits were actually consumed. The digits are
// converted to an int32 and applied as a signed magnitude shift on result.quantity.
365 if (exponentSign
!= 0 && segment
.getOffset() != initialOffset
) {
366 bool overflow
= false;
367 if (digitsConsumed
.fitsInLong()) {
368 int64_t exponentLong
= digitsConsumed
.toLong(false);
369 U_ASSERT(exponentLong
>= 0);
370 if (exponentLong
<= INT32_MAX
) {
371 auto exponentInt
= static_cast<int32_t>(exponentLong
);
372 if (result
.quantity
.adjustMagnitude(exponentSign
* exponentInt
)) {
// Visible overflow handling: with a negative exponent sign the quantity is cleared;
// the other visible statements mark the quantity bogus and set FLAG_INFINITY.
382 if (exponentSign
== -1) {
384 result
.quantity
.clear();
387 result
.quantity
.bogus
= true;
388 result
.flags
|= FLAG_INFINITY
;
// Regular-number path (presumably the else branch): store the consumed digits as the result.
392 result
.quantity
= digitsConsumed
;
395 // Set other information into the result and return.
396 if (!actualDecimalString
.isBogus()) {
397 result
.flags
|= FLAG_HAS_DECIMAL_SEPARATOR
;
399 result
.setCharsConsumed(segment
);
400 return segment
.length() == 0 || maybeMore
;
// Decides whether a digit group with `count` digits is acceptable for its position.
//   sepType:   the group's lead separator type as encoded by match(): 0 = beginning of
//              string, 1 = grouping separator, 2 = decimal separator. match() also passes
//              -1 for a group that does not exist or no longer needs validation.
//   count:     number of digits in the group.
//   isPrimary: match() passes true for the current group and false for the previous one;
//              presumably this selects between the grouping1 and grouping2 comparisons.
// The branches below apply when requireGroupingMatch is set.
403 bool DecimalMatcher::validateGroup(int32_t sepType
, int32_t count
, bool isPrimary
) const {
404 if (requireGroupingMatch
) {
406 // No such group (prevGroup before first shift).
408 } else if (sepType
== 0) {
411 // No grouping separators is OK.
414 // return count != 0 && count <= grouping2;
// A leading group may be shorter than the secondary grouping size, including empty.
415 return count
<= grouping2
; // Apple <rdar://problem/38565910>, allow initial secondary group of 0
417 } else if (sepType
== 1) {
// Groups introduced by a grouping separator must match a grouping size exactly.
420 return count
== grouping1
;
422 return count
== grouping2
;
425 U_ASSERT(sepType
== 2);
426 // After the decimal separator.
431 // #11230: don't accept middle groups with only 1 digit.
// Inexpensive pre-check on the start of `segment`, looking for something this matcher
// handles: a character from the lead/separator sets, a Unicode digit, or one of the
// locale-specific digit strings.
439 bool DecimalMatcher::smokeTest(const StringSegment
& segment
) const {
440 // The common case uses a static leadSet for efficiency.
441 if (fLocalDigitStrings
.isNull() && leadSet
!= nullptr) {
442 return segment
.startsWith(*leadSet
);
// Slower path: test the separator set and the first code point's digit property.
444 if (segment
.startsWith(*separatorSet
) || u_isdigit(segment
.getCodePoint())) {
// Without local digit strings there is nothing left to compare against.
447 if (fLocalDigitStrings
.isNull()) {
450 // The following test is Apple-specific, for <rdar://7632623>;
451 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
452 if (segment
.startsWith(0x96F6) && fLocalDigitStrings
[0].length()==1 && fLocalDigitStrings
[0].charAt(0)==0x3007) {
// Finally, compare the segment start against each of the ten local digit strings.
455 for (int32_t i
= 0; i
< 10; i
++) {
456 if (segment
.startsWith(fLocalDigitStrings
[i
])) {
463 UnicodeString
DecimalMatcher::toString() const {
468 #endif /* #if !UCONFIG_NO_FORMATTING */