icuSources/i18n/numparse_decimal.cpp

   1 // © 2018 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3
   4 #include "unicode/utypes.h"
   5
   6 #if !UCONFIG_NO_FORMATTING
   7
   8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
   9 // Helpful in toString methods and elsewhere.
  10 #define UNISTR_FROM_STRING_EXPLICIT
  11
  12 #include "numparse_types.h"
  13 #include "numparse_decimal.h"
  14 #include "static_unicode_sets.h"
  15 #include "numparse_utils.h"
  16 #include "unicode/uchar.h"
  17 #include "putilimp.h"
  18 #include "number_decimalquantity.h"
  19 #include "string_segment.h"
  20
  21 using namespace icu;
  22 using namespace icu::numparse;
  23 using namespace icu::numparse::impl;
  24
  25
  26 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
  27                                parse_flags_t parseFlags) {
  28     if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
  29         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
  30         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
  31     } else {
  32         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
  33         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
  34     }
  35     bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
  36     unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
  37                                                 : unisets::ALL_SEPARATORS;
  38
  39     // Attempt to find separators in the static cache
  40
  41     groupingUniSet = unisets::get(groupingKey);
  42     unisets::Key decimalKey = unisets::chooseFrom(
  43             decimalSeparator,
  44             strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
  45             strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
  46     if (decimalKey >= 0) {
  47         decimalUniSet = unisets::get(decimalKey);
  48     } else if (!decimalSeparator.isEmpty()) {
  49         auto* set = new UnicodeSet();
  50         set->add(decimalSeparator.char32At(0));
  51         set->freeze();
  52         decimalUniSet = set;
  53         fLocalDecimalUniSet.adoptInstead(set);
  54     } else {
  55         decimalUniSet = unisets::get(unisets::EMPTY);
  56     }
  57
  58     if (groupingKey >= 0 && decimalKey >= 0) {
  59         // Everything is available in the static cache
  60         separatorSet = groupingUniSet;
  61         leadSet = unisets::get(
  62                 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
  63                                  : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
  64     } else {
  65         auto* set = new UnicodeSet();
  66         set->addAll(*groupingUniSet);
  67         set->addAll(*decimalUniSet);
  68         set->freeze();
  69         separatorSet = set;
  70         fLocalSeparatorSet.adoptInstead(set);
  71         leadSet = nullptr;
  72     }
  73
  74     UChar32 cpZero = symbols.getCodePointZero();
  75     if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
  76         // Uncommon case: okay to allocate.
  77         auto digitStrings = new UnicodeString[10];
  78         fLocalDigitStrings.adoptInstead(digitStrings);
  79         for (int32_t i = 0; i <= 9; i++) {
  80             digitStrings[i] = symbols.getConstDigitSymbol(i);
  81         }
  82     }
  83
  84     requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
  85     groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
  86     integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
  87     grouping1 = grouper.getPrimary();
  88     grouping2 = grouper.getSecondary();
  89
  90     // Fraction grouping parsing is disabled for now but could be enabled later.
  91     // See http://bugs.icu-project.org/trac/ticket/10794
  92     // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
  93 }
  94
  95 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
  96     return match(segment, result, 0, status);
  97 }
  98
  99 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
 100                            UErrorCode&) const {
 101     if (result.seenNumber() && exponentSign == 0) {
 102         // A number has already been consumed.
 103         return false;
 104     } else if (exponentSign != 0) {
 105         // scientific notation always comes after the number
 106         U_ASSERT(!result.quantity.bogus);
 107     }
 108
 109     // Initial offset before any character consumption.
 110     int32_t initialOffset = segment.getOffset();
 111
 112     // Return value: whether to ask for more characters.
 113     bool maybeMore = false;
 114
 115     // All digits consumed so far.
 116     number::impl::DecimalQuantity digitsConsumed;
 117     digitsConsumed.bogus = true;
 118
 119     // The total number of digits after the decimal place, used for scaling the result.
 120     int32_t digitsAfterDecimalPlace = 0;
 121
 122     // The actual grouping and decimal separators used in the string.
 123     // If non-null, we have seen that token.
 124     UnicodeString actualGroupingString;
 125     UnicodeString actualDecimalString;
 126     actualGroupingString.setToBogus();
 127     actualDecimalString.setToBogus();
 128
 129     // Information for two groups: the previous group and the current group.
 130     //
 131     // Each group has three pieces of information:
 132     //
 133     // Offset: the string position of the beginning of the group, including a leading separator
 134     // if there was a leading separator. This is needed in case we need to rewind the parse to
 135     // that position.
 136     //
 137     // Separator type:
 138     // 0 => beginning of string
 139     // 1 => lead separator is a grouping separator
 140     // 2 => lead separator is a decimal separator
 141     //
 142     // Count: the number of digits in the group. If -1, the group has been validated.
 143     int32_t currGroupOffset = 0;
 144     int32_t currGroupSepType = 0;
 145     int32_t currGroupCount = 0;
 146     int32_t prevGroupOffset = -1;
 147     int32_t prevGroupSepType = -1;
 148     int32_t prevGroupCount = -1;
 149
 150     while (segment.length() > 0) {
 151         maybeMore = false;
 152
 153         // Attempt to match a digit.
 154         int8_t digit = -1;
 155
 156         // Try by code point digit value.
 157         UChar32 cp = segment.getCodePoint();
 158         if (u_isdigit(cp)) {
 159             segment.adjustOffset(U16_LENGTH(cp));
 160             digit = static_cast<int8_t>(u_digit(cp, 10));
 161         }
 162
 163         // Try by digit string.
 164         if (digit == -1 && !fLocalDigitStrings.isNull()) {
 165             for (int32_t i = 0; i < 10; i++) {
 166                 const UnicodeString& str = fLocalDigitStrings[i];
 167                 if (str.isEmpty()) {
 168                     continue;
 169                 }
 170                 // The following test is Apple-specific, for <rdar://7632623>;
 171                 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
 172                 int32_t overlap = (segment.startsWith(0x96F6) && fLocalDigitStrings[0].charAt(0)==0x3007)?
 173                     1: segment.getCommonPrefixLength(str);
 174                 if (overlap == str.length()) {
 175                     segment.adjustOffset(overlap);
 176                     digit = static_cast<int8_t>(i);
 177                     break;
 178                 }
 179                 maybeMore = maybeMore || (overlap == segment.length());
 180             }
 181         }
 182
 183         if (digit >= 0) {
 184             // Digit was found.
 185             if (digitsConsumed.bogus) {
 186                 digitsConsumed.bogus = false;
 187                 digitsConsumed.clear();
 188             }
 189             digitsConsumed.appendDigit(digit, 0, true);
 190             currGroupCount++;
 191             if (!actualDecimalString.isBogus()) {
 192                 digitsAfterDecimalPlace++;
 193             }
 194             continue;
 195         }
 196
 197         // Attempt to match a literal grouping or decimal separator.
 198         bool isDecimal = false;
 199         bool isGrouping = false;
 200
 201         // 1) Attempt the decimal separator string literal.
 202         // if (we have not seen a decimal separator yet) { ... }
 203         if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
 204             int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
 205             maybeMore = maybeMore || (overlap == segment.length());
 206             if (overlap == decimalSeparator.length()) {
 207                 isDecimal = true;
 208                 actualDecimalString = decimalSeparator;
 209             }
 210         }
 211
 212         // 2) Attempt to match the actual grouping string literal.
 213         if (!actualGroupingString.isBogus()) {
 214             int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
 215             maybeMore = maybeMore || (overlap == segment.length());
 216             if (overlap == actualGroupingString.length()) {
 217                 isGrouping = true;
 218             }
 219         }
 220
 221         // 2.5) Attempt to match a new the grouping separator string literal.
 222         // if (we have not seen a grouping or decimal separator yet) { ... }
 223         if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
 224             !groupingSeparator.isEmpty()) {
 225             int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
 226             maybeMore = maybeMore || (overlap == segment.length());
 227             if (overlap == groupingSeparator.length()) {
 228                 isGrouping = true;
 229                 actualGroupingString = groupingSeparator;
 230             }
 231         }
 232
 233         // 3) Attempt to match a decimal separator from the equivalence set.
 234         // if (we have not seen a decimal separator yet) { ... }
 235         // The !isGrouping is to confirm that we haven't yet matched the current character.
 236         if (!isGrouping && actualDecimalString.isBogus()) {
 237             if (decimalUniSet->contains(cp)) {
 238                 isDecimal = true;
 239                 actualDecimalString = UnicodeString(cp);
 240             }
 241         }
 242
 243         // 4) Attempt to match a grouping separator from the equivalence set.
 244         // if (we have not seen a grouping or decimal separator yet) { ... }
 245         if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
 246             if (groupingUniSet->contains(cp)) {
 247                 isGrouping = true;
 248                 actualGroupingString = UnicodeString(cp);
 249             }
 250         }
 251
 252         // Leave if we failed to match this as a separator.
 253         if (!isDecimal && !isGrouping) {
 254             break;
 255         }
 256
 257         // Check for conditions when we don't want to accept the separator.
 258         if (isDecimal && integerOnly) {
 259             break;
 260         } else if (currGroupSepType == 2 && isGrouping) {
 261             // Fraction grouping
 262             break;
 263         }
 264
 265         // Validate intermediate grouping sizes.
 266         bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
 267         bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
 268         if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
 269             // Invalid grouping sizes.
 270             if (isGrouping && currGroupCount == 0) {
 271                 // Trailing grouping separators: these are taken care of below
 272                 U_ASSERT(currGroupSepType == 1);
 273             } else if (requireGroupingMatch) {
 274                 // Strict mode: reject the parse
 275                 digitsConsumed.clear();
 276                 digitsConsumed.bogus = true;
 277             }
 278             break;
 279         } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
 280             break;
 281         } else {
 282             // Grouping sizes OK so far.
 283             prevGroupOffset = currGroupOffset;
 284             prevGroupCount = currGroupCount;
 285             if (isDecimal) {
 286                 // Do not validate this group any more.
 287                 prevGroupSepType = -1;
 288             } else {
 289                 prevGroupSepType = currGroupSepType;
 290             }
 291         }
 292
 293         // OK to accept the separator.
 294         // Special case: don't update currGroup if it is empty; this allows two grouping
 295         // separators in a row in lenient mode.
 296         if (currGroupCount != 0) {
 297             currGroupOffset = segment.getOffset();
 298         }
 299         currGroupSepType = isGrouping ? 1 : 2;
 300         currGroupCount = 0;
 301         if (isGrouping) {
 302             segment.adjustOffset(actualGroupingString.length());
 303         } else {
 304             segment.adjustOffset(actualDecimalString.length());
 305         }
 306     }
 307
 308     // End of main loop.
 309     // Back up if there was a trailing grouping separator.
 310     // Shift prev -> curr so we can check it as a final group.
 311     if (currGroupSepType != 2 && currGroupCount == 0) {
 312         maybeMore = true;
 313         segment.setOffset(currGroupOffset);
 314         currGroupOffset = prevGroupOffset;
 315         currGroupSepType = prevGroupSepType;
 316         currGroupCount = prevGroupCount;
 317         prevGroupOffset = -1;
 318         prevGroupSepType = 0;
 319         prevGroupCount = 1;
 320     }
 321
 322     // Validate final grouping sizes.
 323     bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
 324     bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
 325     if (!requireGroupingMatch) {
 326         // The cases we need to handle here are lone digits.
 327         // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
 328         // See more examples in numberformattestspecification.txt
 329         int32_t digitsToRemove = 0;
 330         if (!prevValidSecondary) {
 331             segment.setOffset(prevGroupOffset);
 332             digitsToRemove += prevGroupCount;
 333             digitsToRemove += currGroupCount;
 334         } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
 335             maybeMore = true;
 336             segment.setOffset(currGroupOffset);
 337             digitsToRemove += currGroupCount;
 338         }
 339         if (digitsToRemove != 0) {
 340             digitsConsumed.adjustMagnitude(-digitsToRemove);
 341             digitsConsumed.truncate();
 342         }
 343         prevValidSecondary = true;
 344         currValidPrimary = true;
 345     }
 346     if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
 347         // Grouping failure.
 348         digitsConsumed.bogus = true;
 349     }
 350
 351     // Strings that start with a separator but have no digits,
 352     // or strings that failed a grouping size check.
 353     if (digitsConsumed.bogus) {
 354         maybeMore = maybeMore || (segment.length() == 0);
 355         segment.setOffset(initialOffset);
 356         return maybeMore;
 357     }
 358
 359     // We passed all inspections. Start post-processing.
 360
 361     // Adjust for fraction part.
 362     digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
 363
 364     // Set the digits, either normal or exponent.
 365     if (exponentSign != 0 && segment.getOffset() != initialOffset) {
 366         bool overflow = false;
 367         if (digitsConsumed.fitsInLong()) {
 368             int64_t exponentLong = digitsConsumed.toLong(false);
 369             U_ASSERT(exponentLong >= 0);
 370             if (exponentLong <= INT32_MAX) {
 371                 auto exponentInt = static_cast<int32_t>(exponentLong);
 372                 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
 373                     overflow = true;
 374                 }
 375             } else {
 376                 overflow = true;
 377             }
 378         } else {
 379             overflow = true;
 380         }
 381         if (overflow) {
 382             if (exponentSign == -1) {
 383                 // Set to zero
 384                 result.quantity.clear();
 385             } else {
 386                 // Set to infinity
 387                 result.quantity.bogus = true;
 388                 result.flags |= FLAG_INFINITY;
 389             }
 390         }
 391     } else {
 392         result.quantity = digitsConsumed;
 393     }
 394
 395     // Set other information into the result and return.
 396     if (!actualDecimalString.isBogus()) {
 397         result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
 398     }
 399     result.setCharsConsumed(segment);
 400     return segment.length() == 0 || maybeMore;
 401 }
 402
 403 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
 404     if (requireGroupingMatch) {
 405         if (sepType == -1) {
 406             // No such group (prevGroup before first shift).
 407             return true;
 408         } else if (sepType == 0) {
 409             // First group.
 410             if (isPrimary) {
 411                 // No grouping separators is OK.
 412                 return true;
 413             } else {
 414                 // return count != 0 && count <= grouping2;
 415                 return count <= grouping2; // Apple <rdar://problem/38565910>, allow initial secondary group of 0
 416             }
 417         } else if (sepType == 1) {
 418             // Middle group.
 419             if (isPrimary) {
 420                 return count == grouping1;
 421             } else {
 422                 return count == grouping2;
 423             }
 424         } else {
 425             U_ASSERT(sepType == 2);
 426             // After the decimal separator.
 427             return true;
 428         }
 429     } else {
 430         if (sepType == 1) {
 431             // #11230: don't accept middle groups with only 1 digit.
 432             return count != 1;
 433         } else {
 434             return true;
 435         }
 436     }
 437 }
 438
 439 bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
 440     // The common case uses a static leadSet for efficiency.
 441     if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
 442         return segment.startsWith(*leadSet);
 443     }
 444     if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
 445         return true;
 446     }
 447     if (fLocalDigitStrings.isNull()) {
 448         return false;
 449     }
 450     // The following test is Apple-specific, for <rdar://7632623>;
 451     // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
 452     if (segment.startsWith(0x96F6) && fLocalDigitStrings[0].length()==1 && fLocalDigitStrings[0].charAt(0)==0x3007) {
 453         return true;
 454     }
 455     for (int32_t i = 0; i < 10; i++) {
 456         if (segment.startsWith(fLocalDigitStrings[i])) {
 457             return true;
 458         }
 459     }
 460     return false;
 461 }
 462
 463 UnicodeString DecimalMatcher::toString() const {
 464     return u"<Decimal>";
 465 }
 466
 467
 468 #endif /* #if !UCONFIG_NO_FORMATTING */