1 // © 2018 and later: Unicode, Inc. and others. 
   2 // License & terms of use: http://www.unicode.org/copyright.html 
   4 #include "unicode/utypes.h" 
   6 #if !UCONFIG_NO_FORMATTING 
   8 // Allow implicit conversion from char16_t* to UnicodeString for this file: 
   9 // Helpful in toString methods and elsewhere. 
  10 #define UNISTR_FROM_STRING_EXPLICIT 
  12 #include "numparse_types.h" 
  13 #include "numparse_decimal.h" 
  14 #include "static_unicode_sets.h" 
  15 #include "numparse_utils.h" 
  16 #include "unicode/uchar.h" 
  18 #include "number_decimalquantity.h" 
  19 #include "string_segment.h" 
  22 using namespace icu::numparse
; 
  23 using namespace icu::numparse::impl
; 
  // Builds a DecimalMatcher from the locale's symbols, a Grouper, and the parse flags.
  //
  // Visible work, in order:
  //   1. Choose the grouping/decimal separator strings (monetary or regular symbols).
  //   2. Resolve groupingUniSet and decimalUniSet, using the static unisets cache when a key
  //      is available and a locally allocated UnicodeSet otherwise.
  //   3. Resolve separatorSet/leadSet from the cache, or build a combined local separator set.
  //   4. Copy the ten digit symbols into fLocalDigitStrings when the zero code point cannot be
  //      handled by plain code-point digit lookup.
  //   5. Cache flag bits and grouping sizes in members.
  //
  // @param symbols     Source of the separator and digit symbols.
  // @param grouper     Supplies the primary and secondary grouping sizes.
  // @param parseFlags  Bit set of PARSE_FLAG_* values tested below.
  //
  // NOTE(review): the embedded line numbers skip (e.g. 30 -> 32, 50 -> 53), so some lines of
  // this definition are not visible here; the comments describe only the statements shown.
  26 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols
& symbols
, const Grouper
& grouper
, 
  27                                parse_flags_t parseFlags
) { 
         // Monetary parsing uses the monetary grouping/decimal separator symbols.
  28     if (0 != (parseFlags 
& PARSE_FLAG_MONETARY_SEPARATORS
)) { 
  29         groupingSeparator 
= symbols
.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol
); 
  30         decimalSeparator 
= symbols
.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol
); 
         // Regular (non-monetary) symbols. NOTE(review): presumably the else arm; the
         // branch line itself is not visible in this listing.
  32         groupingSeparator 
= symbols
.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol
); 
  33         decimalSeparator 
= symbols
.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol
); 
         // Strict-separator mode selects the STRICT_* set keys used below.
  35     bool strictSeparators 
= 0 != (parseFlags 
& PARSE_FLAG_STRICT_SEPARATORS
); 
  36     unisets::Key groupingKey 
= strictSeparators 
? unisets::STRICT_ALL_SEPARATORS
 
  37                                                 : unisets::ALL_SEPARATORS
; 
  39     // Attempt to find separators in the static cache 
  41     groupingUniSet 
= unisets::get(groupingKey
); 
         // chooseFrom is handed the comma and period set keys; a negative result (tested
         // right below) means no cached set was selected for the decimal separator.
  42     unisets::Key decimalKey 
= unisets::chooseFrom( 
  44             strictSeparators 
? unisets::STRICT_COMMA 
: unisets::COMMA
, 
  45             strictSeparators 
? unisets::STRICT_PERIOD 
: unisets::PERIOD
); 
  46     if (decimalKey 
>= 0) { 
  47         decimalUniSet 
= unisets::get(decimalKey
); 
  48     } else if (!decimalSeparator
.isEmpty()) { 
             // No cached set: build a set holding the first code point of the decimal
             // separator and hand it to fLocalDecimalUniSet via adoptInstead.
  49         auto* set 
= new UnicodeSet(); 
  50         set
->add(decimalSeparator
.char32At(0)); 
  53         fLocalDecimalUniSet
.adoptInstead(set
); 
             // Fall back to the static EMPTY set.
  55         decimalUniSet 
= unisets::get(unisets::EMPTY
); 
  58     if (groupingKey 
>= 0 && decimalKey 
>= 0) { 
  59         // Everything is available in the static cache 
  60         separatorSet 
= groupingUniSet
; 
             // NOTE(review): strict mode maps to DIGITS_OR_ALL_SEPARATORS and non-strict mode
             // to DIGITS_OR_STRICT_ALL_SEPARATORS, the opposite pairing from groupingKey
             // above. Confirm whether this is intended.
  61         leadSet 
= unisets::get( 
  62                 strictSeparators 
? unisets::DIGITS_OR_ALL_SEPARATORS
 
  63                                  : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS
); 
             // Build a combined grouping + decimal set and hand it to fLocalSeparatorSet.
  65         auto* set 
= new UnicodeSet(); 
  66         set
->addAll(*groupingUniSet
); 
  67         set
->addAll(*decimalUniSet
); 
  70         fLocalSeparatorSet
.adoptInstead(set
); 
         // Digit strings are stored only when the zero code point is absent (-1), is not a
         // digit, or does not have digit value 0.
  74     UChar32 cpZero 
= symbols
.getCodePointZero(); 
  75     if (cpZero 
== -1 || !u_isdigit(cpZero
) || u_digit(cpZero
, 10) != 0) { 
  76         // Uncommon case: okay to allocate. 
  77         auto digitStrings 
= new UnicodeString
[10]; 
  78         fLocalDigitStrings
.adoptInstead(digitStrings
); 
  79         for (int32_t i 
= 0; i 
<= 9; i
++) { 
  80             digitStrings
[i
] = symbols
.getConstDigitSymbol(i
); 
         // Cache the relevant parse flags as booleans.
  84     requireGroupingMatch 
= 0 != (parseFlags 
& PARSE_FLAG_STRICT_GROUPING_SIZE
); 
  85     groupingDisabled 
= 0 != (parseFlags 
& PARSE_FLAG_GROUPING_DISABLED
); 
  86     integerOnly 
= 0 != (parseFlags 
& PARSE_FLAG_INTEGER_ONLY
); 
         // Primary and secondary grouping sizes come from the grouper.
  87     grouping1 
= grouper
.getPrimary(); 
  88     grouping2 
= grouper
.getSecondary(); 
  90     // Fraction grouping parsing is disabled for now but could be enabled later. 
  91     // See http://bugs.icu-project.org/trac/ticket/10794 
  92     // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED); 
  // Convenience overload: forwards to the exponent-aware match() with an exponent sign of 0.
  //
  // @param segment  Input text to match against.
  // @param result   Parse result, passed through to the other overload.
  // @param status   Error code, passed through to the other overload.
  // @return Whatever the exponent-aware overload returns.
  95 bool DecimalMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, UErrorCode
& status
) const { 
  96     return match(segment
, result
, 0, status
); 
  // Attempts to consume a decimal number (digits with optional grouping and decimal
  // separators) from the front of `segment`.
  //
  // @param segment       Input; its offset is advanced past consumed text (adjustOffset) and
  //                      rewound (setOffset) when a trailing separator or a rejected group is
  //                      backed out.
  // @param result        Receives the parsed quantity, flags, and chars-consumed position.
  // @param exponentSign  0 when parsing the main number; non-zero when parsing exponent
  //                      digits, in which case the parsed value is applied through
  //                      result.quantity.adjustMagnitude(exponentSign * exponent).
  // @return On the final visible return: `segment.length() == 0 || maybeMore`.
  //
  // NOTE(review): the embedded line numbers skip throughout this function (the remaining
  // parameter line, several branch/return/break lines and closing braces are not visible);
  // the comments describe only the statements shown.
  99 bool DecimalMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, int8_t exponentSign
, 
 101     if (result
.seenNumber() && exponentSign 
== 0) { 
 102         // A number has already been consumed. 
 104     } else if (exponentSign 
!= 0) { 
 105         // scientific notation always comes after the number 
 106         U_ASSERT(!result
.quantity
.bogus
); 
 109     // Initial offset before any character consumption. 
 110     int32_t initialOffset 
= segment
.getOffset(); 
 112     // Return value: whether to ask for more characters. 
 113     bool maybeMore 
= false; 
 115     // All digits consumed so far. 
         // Starts out bogus; it is cleared and un-bogused when the first digit is appended.
 116     number::impl::DecimalQuantity digitsConsumed
; 
 117     digitsConsumed
.bogus 
= true; 
 119     // The total number of digits after the decimal place, used for scaling the result. 
 120     int32_t digitsAfterDecimalPlace 
= 0; 
 122     // The actual grouping and decimal separators used in the string. 
 123     // If not bogus, we have seen that token. 
 124     UnicodeString actualGroupingString
; 
 125     UnicodeString actualDecimalString
; 
 126     actualGroupingString
.setToBogus(); 
 127     actualDecimalString
.setToBogus(); 
 129     // Information for two groups: the previous group and the current group. 
 131     // Each group has three pieces of information: 
 133     // Offset: the string position of the beginning of the group, including a leading separator 
 134     // if there was a leading separator. This is needed in case we need to rewind the parse to 
 138     // 0 => beginning of string 
 139     // 1 => lead separator is a grouping separator 
 140     // 2 => lead separator is a decimal separator 
 142     // Count: the number of digits in the group. If -1, the group has been validated. 
 143     int32_t currGroupOffset 
= 0; 
 144     int32_t currGroupSepType 
= 0; 
 145     int32_t currGroupCount 
= 0; 
 146     int32_t prevGroupOffset 
= -1; 
 147     int32_t prevGroupSepType 
= -1; 
 148     int32_t prevGroupCount 
= -1; 
         // Main loop: each pass first tries to read a digit, then a separator.
 150     while (segment
.length() > 0) { 
 153         // Attempt to match a digit. 
 156         // Try by code point digit value. 
 157         UChar32 cp 
= segment
.getCodePoint(); 
 159             segment
.adjustOffset(U16_LENGTH(cp
)); 
 160             digit 
= static_cast<int8_t>(u_digit(cp
, 10)); 
 163         // Try by digit string. 
 164         if (digit 
== -1 && !fLocalDigitStrings
.isNull()) { 
 165             for (int32_t i 
= 0; i 
< 10; i
++) { 
 166                 const UnicodeString
& str 
= fLocalDigitStrings
[i
]; 
 170                 // The following test is Apple-specific, for <rdar://7632623>; 
 171                 // if \u3007 is treated as 0 for parsing, \u96F6 should be too. 
 172                 int32_t overlap 
= (segment
.startsWith(0x96F6) && fLocalDigitStrings
[0].charAt(0)==0x3007)? 
 173                     1: segment
.getCommonPrefixLength(str
); 
 174                 if (overlap 
== str
.length()) { 
 175                     segment
.adjustOffset(overlap
); 
 176                     digit 
= static_cast<int8_t>(i
); 
                     // If the overlap covers all remaining input, more input might still
                     // complete a digit string.
 179                 maybeMore 
= maybeMore 
|| (overlap 
== segment
.length()); 
 185             if (digitsConsumed
.bogus
) { 
 186                 digitsConsumed
.bogus 
= false; 
 187                 digitsConsumed
.clear(); 
 189             digitsConsumed
.appendDigit(digit
, 0, true); 
                 // Digits seen after a decimal separator count toward the fraction scale.
 191             if (!actualDecimalString
.isBogus()) { 
 192                 digitsAfterDecimalPlace
++; 
 197         // Attempt to match a literal grouping or decimal separator. 
 198         bool isDecimal 
= false; 
 199         bool isGrouping 
= false; 
 201         // 1) Attempt the decimal separator string literal. 
 202         // if (we have not seen a decimal separator yet) { ... } 
 203         if (actualDecimalString
.isBogus() && !decimalSeparator
.isEmpty()) { 
 204             int32_t overlap 
= segment
.getCommonPrefixLength(decimalSeparator
); 
 205             maybeMore 
= maybeMore 
|| (overlap 
== segment
.length()); 
 206             if (overlap 
== decimalSeparator
.length()) { 
 208                 actualDecimalString 
= decimalSeparator
; 
 212         // 2) Attempt to match the actual grouping string literal. 
 213         if (!actualGroupingString
.isBogus()) { 
 214             int32_t overlap 
= segment
.getCommonPrefixLength(actualGroupingString
); 
 215             maybeMore 
= maybeMore 
|| (overlap 
== segment
.length()); 
 216             if (overlap 
== actualGroupingString
.length()) { 
 221         // 2.5) Attempt to match a new grouping separator string literal. 
 222         // if (we have not seen a grouping or decimal separator yet) { ... } 
 223         if (!groupingDisabled 
&& actualGroupingString
.isBogus() && actualDecimalString
.isBogus() && 
 224             !groupingSeparator
.isEmpty()) { 
 225             int32_t overlap 
= segment
.getCommonPrefixLength(groupingSeparator
); 
 226             maybeMore 
= maybeMore 
|| (overlap 
== segment
.length()); 
 227             if (overlap 
== groupingSeparator
.length()) { 
 229                 actualGroupingString 
= groupingSeparator
; 
 233         // 3) Attempt to match a decimal separator from the equivalence set. 
 234         // if (we have not seen a decimal separator yet) { ... } 
 235         // The !isGrouping is to confirm that we haven't yet matched the current character. 
 236         if (!isGrouping 
&& actualDecimalString
.isBogus()) { 
 237             if (decimalUniSet
->contains(cp
)) { 
 239                 actualDecimalString 
= UnicodeString(cp
); 
 243         // 4) Attempt to match a grouping separator from the equivalence set. 
 244         // if (we have not seen a grouping or decimal separator yet) { ... } 
 245         if (!groupingDisabled 
&& actualGroupingString
.isBogus() && actualDecimalString
.isBogus()) { 
 246             if (groupingUniSet
->contains(cp
)) { 
 248                 actualGroupingString 
= UnicodeString(cp
); 
 252         // Leave if we failed to match this as a separator. 
 253         if (!isDecimal 
&& !isGrouping
) { 
 257         // Check for conditions when we don't want to accept the separator. 
 258         if (isDecimal 
&& integerOnly
) { 
 260         } else if (currGroupSepType 
== 2 && isGrouping
) { 
 265         // Validate intermediate grouping sizes. 
 266         bool prevValidSecondary 
= validateGroup(prevGroupSepType
, prevGroupCount
, false); 
 267         bool currValidPrimary 
= validateGroup(currGroupSepType
, currGroupCount
, true); 
 268         if (!prevValidSecondary 
|| (isDecimal 
&& !currValidPrimary
)) { 
 269             // Invalid grouping sizes. 
 270             if (isGrouping 
&& currGroupCount 
== 0) { 
 271                 // Trailing grouping separators: these are taken care of below 
 272                 U_ASSERT(currGroupSepType 
== 1); 
 273             } else if (requireGroupingMatch
) { 
 274                 // Strict mode: reject the parse 
 275                 digitsConsumed
.clear(); 
 276                 digitsConsumed
.bogus 
= true; 
 279         } else if (requireGroupingMatch 
&& currGroupCount 
== 0 && currGroupSepType 
== 1) { 
 282             // Grouping sizes OK so far. 
 283             prevGroupOffset 
= currGroupOffset
; 
 284             prevGroupCount 
= currGroupCount
; 
 286                 // Do not validate this group any more. 
 287                 prevGroupSepType 
= -1; 
 289                 prevGroupSepType 
= currGroupSepType
; 
 293         // OK to accept the separator. 
 294         // Special case: don't update currGroup if it is empty; this allows two grouping 
 295         // separators in a row in lenient mode. 
 296         if (currGroupCount 
!= 0) { 
 297             currGroupOffset 
= segment
.getOffset(); 
 299         currGroupSepType 
= isGrouping 
? 1 : 2; 
             // Advance past whichever separator string was matched.
 302             segment
.adjustOffset(actualGroupingString
.length()); 
 304             segment
.adjustOffset(actualDecimalString
.length()); 
 309     // Back up if there was a trailing grouping separator. 
 310     // Shift prev -> curr so we can check it as a final group. 
 311     if (currGroupSepType 
!= 2 && currGroupCount 
== 0) { 
 313         segment
.setOffset(currGroupOffset
); 
 314         currGroupOffset 
= prevGroupOffset
; 
 315         currGroupSepType 
= prevGroupSepType
; 
 316         currGroupCount 
= prevGroupCount
; 
 317         prevGroupOffset 
= -1; 
 318         prevGroupSepType 
= 0; 
 322     // Validate final grouping sizes. 
 323     bool prevValidSecondary 
= validateGroup(prevGroupSepType
, prevGroupCount
, false); 
 324     bool currValidPrimary 
= validateGroup(currGroupSepType
, currGroupCount
, true); 
 325     if (!requireGroupingMatch
) { 
 326         // The cases we need to handle here are lone digits. 
 327         // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1) 
 328         // See more examples in numberformattestspecification.txt 
 329         int32_t digitsToRemove 
= 0; 
 330         if (!prevValidSecondary
) { 
 331             segment
.setOffset(prevGroupOffset
); 
 332             digitsToRemove 
+= prevGroupCount
; 
 333             digitsToRemove 
+= currGroupCount
; 
 334         } else if (!currValidPrimary 
&& (prevGroupSepType 
!= 0 || prevGroupCount 
!= 0)) { 
 336             segment
.setOffset(currGroupOffset
); 
 337             digitsToRemove 
+= currGroupCount
; 
             // Drop the digits of the rejected trailing group(s) from the quantity.
 339         if (digitsToRemove 
!= 0) { 
 340             digitsConsumed
.adjustMagnitude(-digitsToRemove
); 
 341             digitsConsumed
.truncate(); 
             // Lenient mode: after trimming, treat both groups as valid.
 343         prevValidSecondary 
= true; 
 344         currValidPrimary 
= true; 
 346     if (currGroupSepType 
!= 2 && (!prevValidSecondary 
|| !currValidPrimary
)) { 
 348         digitsConsumed
.bogus 
= true; 
 351     // Strings that start with a separator but have no digits, 
 352     // or strings that failed a grouping size check. 
 353     if (digitsConsumed
.bogus
) { 
 354         maybeMore 
= maybeMore 
|| (segment
.length() == 0); 
 355         segment
.setOffset(initialOffset
); 
 359     // We passed all inspections. Start post-processing. 
 361     // Adjust for fraction part. 
 362     digitsConsumed
.adjustMagnitude(-digitsAfterDecimalPlace
); 
 364     // Set the digits, either normal or exponent. 
 365     if (exponentSign 
!= 0 && segment
.getOffset() != initialOffset
) { 
 366         bool overflow 
= false; 
 367         if (digitsConsumed
.fitsInLong()) { 
 368             int64_t exponentLong 
= digitsConsumed
.toLong(false); 
 369             U_ASSERT(exponentLong 
>= 0); 
 370             if (exponentLong 
<= INT32_MAX
) { 
 371                 auto exponentInt 
= static_cast<int32_t>(exponentLong
); 
 372                 if (result
.quantity
.adjustMagnitude(exponentSign 
* exponentInt
)) { 
             // NOTE(review): presumably the overflow handling. A negative exponent sign
             // clears the quantity; the other visible statements mark the quantity bogus
             // and set FLAG_INFINITY. The enclosing branch lines are not visible here.
 382             if (exponentSign 
== -1) { 
 384                 result
.quantity
.clear(); 
 387                 result
.quantity
.bogus 
= true; 
 388                 result
.flags 
|= FLAG_INFINITY
; 
 392         result
.quantity 
= digitsConsumed
; 
 395     // Set other information into the result and return. 
 396     if (!actualDecimalString
.isBogus()) { 
 397         result
.flags 
|= FLAG_HAS_DECIMAL_SEPARATOR
; 
 399     result
.setCharsConsumed(segment
); 
 400     return segment
.length() == 0 || maybeMore
; 
 // Decides whether a digit group is acceptable under the current grouping rules.
 //
 // @param sepType    Separator that introduced the group. Values compared below are 0 and 1,
 //                   with 2 asserted as the remaining case; a comment also mentions a
 //                   "no such group" case whose test is not visible in this listing.
 // @param count      Number of digits in the group.
 // @param isPrimary  NOTE(review): presumably selects between the grouping1 and grouping2
 //                   comparisons; the branch lines that use it are not visible here.
 // @return Whether the group passes validation.
 403 bool DecimalMatcher::validateGroup(int32_t sepType
, int32_t count
, bool isPrimary
) const { 
         // Strict grouping: group sizes are compared against grouping1/grouping2.
 404     if (requireGroupingMatch
) { 
 406             // No such group (prevGroup before first shift). 
 408         } else if (sepType 
== 0) { 
 411                 // No grouping separators is OK. 
 414                 // return count != 0 && count <= grouping2; 
 415                 return count 
<= grouping2
; // Apple <rdar://problem/38565910>, allow initial secondary group of 0 
 417         } else if (sepType 
== 1) { 
 420                 return count 
== grouping1
; 
 422                 return count 
== grouping2
; 
 425             U_ASSERT(sepType 
== 2); 
 426             // After the decimal separator. 
 431             // #11230: don't accept middle groups with only 1 digit. 
 // Quick pre-check on the start of `segment`, using only startsWith()/u_isdigit() tests.
 //
 // Visible checks, in order:
 //   1. With no local digit strings and a non-null leadSet, the answer is simply
 //      segment.startsWith(*leadSet).
 //   2. Otherwise: does the segment start with a member of separatorSet, or is its first
 //      code point a digit per u_isdigit()?
 //   3. Apple-specific: U+96F6 at the start when digit string 0 is the single character U+3007.
 //   4. Does the segment start with any of the ten local digit strings?
 //
 // @param segment  Input text; only inspected, never advanced (it is const).
 // @return For case 1, the startsWith(*leadSet) result. NOTE(review): the return statements
 //         for the remaining cases are not visible in this listing.
 439 bool DecimalMatcher::smokeTest(const StringSegment
& segment
) const { 
 440     // The common case uses a static leadSet for efficiency. 
 441     if (fLocalDigitStrings
.isNull() && leadSet 
!= nullptr) { 
 442         return segment
.startsWith(*leadSet
); 
 444     if (segment
.startsWith(*separatorSet
) || u_isdigit(segment
.getCodePoint())) { 
         // Without local digit strings there is nothing further to compare against.
 447     if (fLocalDigitStrings
.isNull()) { 
 450     // The following test is Apple-specific, for <rdar://7632623>; 
 451     // if \u3007 is treated as 0 for parsing, \u96F6 should be too. 
 452     if (segment
.startsWith(0x96F6) && fLocalDigitStrings
[0].length()==1 && fLocalDigitStrings
[0].charAt(0)==0x3007) { 
 455     for (int32_t i 
= 0; i 
< 10; i
++) { 
 456         if (segment
.startsWith(fLocalDigitStrings
[i
])) { 
 // Returns a UnicodeString for this matcher.
 // NOTE(review): the function body is not visible in this listing, so the returned text is
 // not documented here.
 463 UnicodeString 
DecimalMatcher::toString() const { 
 468 #endif /* #if !UCONFIG_NO_FORMATTING */