icuSources/i18n/numparse_decimal.cpp

   1 // © 2018 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3
   4 #include "unicode/utypes.h"
   5
   6 #if !UCONFIG_NO_FORMATTING
   7
   8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
   9 // Helpful in toString methods and elsewhere.
  10 #define UNISTR_FROM_STRING_EXPLICIT
  11
  12 #include "numparse_types.h"
  13 #include "numparse_decimal.h"
  14 #include "static_unicode_sets.h"
  15 #include "numparse_utils.h"
  16 #include "unicode/uchar.h"
  17 #include "putilimp.h"
  18 #include "number_decimalquantity.h"
  19
  20 using namespace icu;
  21 using namespace icu::numparse;
  22 using namespace icu::numparse::impl;
  23
  24
  25 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
  26                                parse_flags_t parseFlags) {
  27     if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
  28         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
  29         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
  30     } else {
  31         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
  32         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
  33     }
  34     bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
  35     unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
  36                                                 : unisets::ALL_SEPARATORS;
  37
  38     // Attempt to find separators in the static cache
  39
  40     groupingUniSet = unisets::get(groupingKey);
  41     unisets::Key decimalKey = unisets::chooseFrom(
  42             decimalSeparator,
  43             strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
  44             strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
  45     if (decimalKey >= 0) {
  46         decimalUniSet = unisets::get(decimalKey);
  47     } else if (!decimalSeparator.isEmpty()) {
  48         auto* set = new UnicodeSet();
  49         set->add(decimalSeparator.char32At(0));
  50         set->freeze();
  51         decimalUniSet = set;
  52         fLocalDecimalUniSet.adoptInstead(set);
  53     } else {
  54         decimalUniSet = unisets::get(unisets::EMPTY);
  55     }
  56
  57     if (groupingKey >= 0 && decimalKey >= 0) {
  58         // Everything is available in the static cache
  59         separatorSet = groupingUniSet;
  60         leadSet = unisets::get(
  61                 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
  62                                  : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
  63     } else {
  64         auto* set = new UnicodeSet();
  65         set->addAll(*groupingUniSet);
  66         set->addAll(*decimalUniSet);
  67         set->freeze();
  68         separatorSet = set;
  69         fLocalSeparatorSet.adoptInstead(set);
  70         leadSet = nullptr;
  71     }
  72
  73     UChar32 cpZero = symbols.getCodePointZero();
  74     if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
  75         // Uncommon case: okay to allocate.
  76         auto digitStrings = new UnicodeString[10];
  77         fLocalDigitStrings.adoptInstead(digitStrings);
  78         for (int32_t i = 0; i <= 9; i++) {
  79             digitStrings[i] = symbols.getConstDigitSymbol(i);
  80         }
  81     }
  82
  83     requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
  84     groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
  85     integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
  86     grouping1 = grouper.getPrimary();
  87     grouping2 = grouper.getSecondary();
  88
  89     // Fraction grouping parsing is disabled for now but could be enabled later.
  90     // See http://bugs.icu-project.org/trac/ticket/10794
  91     // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
  92 }
  93
  94 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
  95     return match(segment, result, 0, status);
  96 }
  97
  98 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
  99                            UErrorCode&) const {
 100     if (result.seenNumber() && exponentSign == 0) {
 101         // A number has already been consumed.
 102         return false;
 103     } else if (exponentSign != 0) {
 104         // scientific notation always comes after the number
 105         U_ASSERT(!result.quantity.bogus);
 106     }
 107
 108     // Initial offset before any character consumption.
 109     int32_t initialOffset = segment.getOffset();
 110
 111     // Return value: whether to ask for more characters.
 112     bool maybeMore = false;
 113
 114     // All digits consumed so far.
 115     number::impl::DecimalQuantity digitsConsumed;
 116     digitsConsumed.bogus = true;
 117
 118     // The total number of digits after the decimal place, used for scaling the result.
 119     int32_t digitsAfterDecimalPlace = 0;
 120
 121     // The actual grouping and decimal separators used in the string.
 122     // If non-null, we have seen that token.
 123     UnicodeString actualGroupingString;
 124     UnicodeString actualDecimalString;
 125     actualGroupingString.setToBogus();
 126     actualDecimalString.setToBogus();
 127
 128     // Information for two groups: the previous group and the current group.
 129     //
 130     // Each group has three pieces of information:
 131     //
 132     // Offset: the string position of the beginning of the group, including a leading separator
 133     // if there was a leading separator. This is needed in case we need to rewind the parse to
 134     // that position.
 135     //
 136     // Separator type:
 137     // 0 => beginning of string
 138     // 1 => lead separator is a grouping separator
 139     // 2 => lead separator is a decimal separator
 140     //
 141     // Count: the number of digits in the group. If -1, the group has been validated.
 142     int32_t currGroupOffset = 0;
 143     int32_t currGroupSepType = 0;
 144     int32_t currGroupCount = 0;
 145     int32_t prevGroupOffset = -1;
 146     int32_t prevGroupSepType = -1;
 147     int32_t prevGroupCount = -1;
 148
 149     while (segment.length() > 0) {
 150         maybeMore = false;
 151
 152         // Attempt to match a digit.
 153         int8_t digit = -1;
 154
 155         // Try by code point digit value.
 156         UChar32 cp = segment.getCodePoint();
 157         if (u_isdigit(cp)) {
 158             segment.adjustOffset(U16_LENGTH(cp));
 159             digit = static_cast<int8_t>(u_digit(cp, 10));
 160         }
 161
 162         // Try by digit string.
 163         if (digit == -1 && !fLocalDigitStrings.isNull()) {
 164             for (int32_t i = 0; i < 10; i++) {
 165                 const UnicodeString& str = fLocalDigitStrings[i];
 166                 if (str.isEmpty()) {
 167                     continue;
 168                 }
 169                 // The following test is Apple-specific, for <rdar://7632623>;
 170                 // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
 171                 int32_t overlap = (segment.startsWith(0x96F6) && fLocalDigitStrings[0].charAt(0)==0x3007)?
 172                     1: segment.getCommonPrefixLength(str);
 173                 if (overlap == str.length()) {
 174                     segment.adjustOffset(overlap);
 175                     digit = static_cast<int8_t>(i);
 176                     break;
 177                 }
 178                 maybeMore = maybeMore || (overlap == segment.length());
 179             }
 180         }
 181
 182         if (digit >= 0) {
 183             // Digit was found.
 184             if (digitsConsumed.bogus) {
 185                 digitsConsumed.bogus = false;
 186                 digitsConsumed.clear();
 187             }
 188             digitsConsumed.appendDigit(digit, 0, true);
 189             currGroupCount++;
 190             if (!actualDecimalString.isBogus()) {
 191                 digitsAfterDecimalPlace++;
 192             }
 193             continue;
 194         }
 195
 196         // Attempt to match a literal grouping or decimal separator.
 197         bool isDecimal = false;
 198         bool isGrouping = false;
 199
 200         // 1) Attempt the decimal separator string literal.
 201         // if (we have not seen a decimal separator yet) { ... }
 202         if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
 203             int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
 204             maybeMore = maybeMore || (overlap == segment.length());
 205             if (overlap == decimalSeparator.length()) {
 206                 isDecimal = true;
 207                 actualDecimalString = decimalSeparator;
 208             }
 209         }
 210
 211         // 2) Attempt to match the actual grouping string literal.
 212         if (!actualGroupingString.isBogus()) {
 213             int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
 214             maybeMore = maybeMore || (overlap == segment.length());
 215             if (overlap == actualGroupingString.length()) {
 216                 isGrouping = true;
 217             }
 218         }
 219
 220         // 2.5) Attempt to match a new the grouping separator string literal.
 221         // if (we have not seen a grouping or decimal separator yet) { ... }
 222         if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
 223             !groupingSeparator.isEmpty()) {
 224             int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
 225             maybeMore = maybeMore || (overlap == segment.length());
 226             if (overlap == groupingSeparator.length()) {
 227                 isGrouping = true;
 228                 actualGroupingString = groupingSeparator;
 229             }
 230         }
 231
 232         // 3) Attempt to match a decimal separator from the equivalence set.
 233         // if (we have not seen a decimal separator yet) { ... }
 234         // The !isGrouping is to confirm that we haven't yet matched the current character.
 235         if (!isGrouping && actualDecimalString.isBogus()) {
 236             if (decimalUniSet->contains(cp)) {
 237                 isDecimal = true;
 238                 actualDecimalString = UnicodeString(cp);
 239             }
 240         }
 241
 242         // 4) Attempt to match a grouping separator from the equivalence set.
 243         // if (we have not seen a grouping or decimal separator yet) { ... }
 244         if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
 245             if (groupingUniSet->contains(cp)) {
 246                 isGrouping = true;
 247                 actualGroupingString = UnicodeString(cp);
 248             }
 249         }
 250
 251         // Leave if we failed to match this as a separator.
 252         if (!isDecimal && !isGrouping) {
 253             break;
 254         }
 255
 256         // Check for conditions when we don't want to accept the separator.
 257         if (isDecimal && integerOnly) {
 258             break;
 259         } else if (currGroupSepType == 2 && isGrouping) {
 260             // Fraction grouping
 261             break;
 262         }
 263
 264         // Validate intermediate grouping sizes.
 265         bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
 266         bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
 267         if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
 268             // Invalid grouping sizes.
 269             if (isGrouping && currGroupCount == 0) {
 270                 // Trailing grouping separators: these are taken care of below
 271                 U_ASSERT(currGroupSepType == 1);
 272             } else if (requireGroupingMatch) {
 273                 // Strict mode: reject the parse
 274                 digitsConsumed.clear();
 275                 digitsConsumed.bogus = true;
 276             }
 277             break;
 278         } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
 279             break;
 280         } else {
 281             // Grouping sizes OK so far.
 282             prevGroupOffset = currGroupOffset;
 283             prevGroupCount = currGroupCount;
 284             if (isDecimal) {
 285                 // Do not validate this group any more.
 286                 prevGroupSepType = -1;
 287             } else {
 288                 prevGroupSepType = currGroupSepType;
 289             }
 290         }
 291
 292         // OK to accept the separator.
 293         // Special case: don't update currGroup if it is empty; this allows two grouping
 294         // separators in a row in lenient mode.
 295         if (currGroupCount != 0) {
 296             currGroupOffset = segment.getOffset();
 297         }
 298         currGroupSepType = isGrouping ? 1 : 2;
 299         currGroupCount = 0;
 300         if (isGrouping) {
 301             segment.adjustOffset(actualGroupingString.length());
 302         } else {
 303             segment.adjustOffset(actualDecimalString.length());
 304         }
 305     }
 306
 307     // End of main loop.
 308     // Back up if there was a trailing grouping separator.
 309     // Shift prev -> curr so we can check it as a final group.
 310     if (currGroupSepType != 2 && currGroupCount == 0) {
 311         maybeMore = true;
 312         segment.setOffset(currGroupOffset);
 313         currGroupOffset = prevGroupOffset;
 314         currGroupSepType = prevGroupSepType;
 315         currGroupCount = prevGroupCount;
 316         prevGroupOffset = -1;
 317         prevGroupSepType = 0;
 318         prevGroupCount = 1;
 319     }
 320
 321     // Validate final grouping sizes.
 322     bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
 323     bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
 324     if (!requireGroupingMatch) {
 325         // The cases we need to handle here are lone digits.
 326         // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
 327         // See more examples in numberformattestspecification.txt
 328         int32_t digitsToRemove = 0;
 329         if (!prevValidSecondary) {
 330             segment.setOffset(prevGroupOffset);
 331             digitsToRemove += prevGroupCount;
 332             digitsToRemove += currGroupCount;
 333         } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
 334             maybeMore = true;
 335             segment.setOffset(currGroupOffset);
 336             digitsToRemove += currGroupCount;
 337         }
 338         if (digitsToRemove != 0) {
 339             digitsConsumed.adjustMagnitude(-digitsToRemove);
 340             digitsConsumed.truncate();
 341         }
 342         prevValidSecondary = true;
 343         currValidPrimary = true;
 344     }
 345     if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
 346         // Grouping failure.
 347         digitsConsumed.bogus = true;
 348     }
 349
 350     // Strings that start with a separator but have no digits,
 351     // or strings that failed a grouping size check.
 352     if (digitsConsumed.bogus) {
 353         maybeMore = maybeMore || (segment.length() == 0);
 354         segment.setOffset(initialOffset);
 355         return maybeMore;
 356     }
 357
 358     // We passed all inspections. Start post-processing.
 359
 360     // Adjust for fraction part.
 361     digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
 362
 363     // Set the digits, either normal or exponent.
 364     if (exponentSign != 0 && segment.getOffset() != initialOffset) {
 365         bool overflow = false;
 366         if (digitsConsumed.fitsInLong()) {
 367             int64_t exponentLong = digitsConsumed.toLong(false);
 368             U_ASSERT(exponentLong >= 0);
 369             if (exponentLong <= INT32_MAX) {
 370                 auto exponentInt = static_cast<int32_t>(exponentLong);
 371                 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
 372                     overflow = true;
 373                 }
 374             } else {
 375                 overflow = true;
 376             }
 377         } else {
 378             overflow = true;
 379         }
 380         if (overflow) {
 381             if (exponentSign == -1) {
 382                 // Set to zero
 383                 result.quantity.clear();
 384             } else {
 385                 // Set to infinity
 386                 result.quantity.bogus = true;
 387                 result.flags |= FLAG_INFINITY;
 388             }
 389         }
 390     } else {
 391         result.quantity = digitsConsumed;
 392     }
 393
 394     // Set other information into the result and return.
 395     if (!actualDecimalString.isBogus()) {
 396         result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
 397     }
 398     result.setCharsConsumed(segment);
 399     return segment.length() == 0 || maybeMore;
 400 }
 401
 402 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
 403     if (requireGroupingMatch) {
 404         if (sepType == -1) {
 405             // No such group (prevGroup before first shift).
 406             return true;
 407         } else if (sepType == 0) {
 408             // First group.
 409             if (isPrimary) {
 410                 // No grouping separators is OK.
 411                 return true;
 412             } else {
 413                 // return count != 0 && count <= grouping2;
 414                 return count <= grouping2; // Apple <rdar://problem/38565910>, allow initial secondary group of 0
 415             }
 416         } else if (sepType == 1) {
 417             // Middle group.
 418             if (isPrimary) {
 419                 return count == grouping1;
 420             } else {
 421                 return count == grouping2;
 422             }
 423         } else {
 424             U_ASSERT(sepType == 2);
 425             // After the decimal separator.
 426             return true;
 427         }
 428     } else {
 429         if (sepType == 1) {
 430             // #11230: don't accept middle groups with only 1 digit.
 431             return count != 1;
 432         } else {
 433             return true;
 434         }
 435     }
 436 }
 437
 438 bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
 439     // The common case uses a static leadSet for efficiency.
 440     if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
 441         return segment.startsWith(*leadSet);
 442     }
 443     if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
 444         return true;
 445     }
 446     if (fLocalDigitStrings.isNull()) {
 447         return false;
 448     }
 449     // The following test is Apple-specific, for <rdar://7632623>;
 450     // if \u3007 is treated as 0 for parsing, \u96F6 should be too.
 451     if (segment.startsWith(0x96F6) && fLocalDigitStrings[0].length()==1 && fLocalDigitStrings[0].charAt(0)==0x3007) {
 452         return true;
 453     }
 454     for (int32_t i = 0; i < 10; i++) {
 455         if (segment.startsWith(fLocalDigitStrings[i])) {
 456             return true;
 457         }
 458     }
 459     return false;
 460 }
 461
 462 UnicodeString DecimalMatcher::toString() const {
 463     return u"<Decimal>";
 464 }
 465
 466
 467 #endif /* #if !UCONFIG_NO_FORMATTING */