1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 #include "unicode/utypes.h"
6 #if !UCONFIG_NO_FORMATTING
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
12 #include "numparse_types.h"
13 #include "numparse_affixes.h"
14 #include "numparse_utils.h"
15 #include "number_utils.h"
18 using namespace icu::numparse
;
19 using namespace icu::numparse::impl
;
20 using namespace icu::number
;
21 using namespace icu::number::impl
;
27 * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
28 * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
29 * the given pattern string.
31 static bool matched(const AffixPatternMatcher
* affix
, const UnicodeString
& patternString
) {
32 return (affix
== nullptr && patternString
.isBogus()) ||
33 (affix
!= nullptr && affix
->getPattern() == patternString
);
37 * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
39 static int32_t length(const AffixPatternMatcher
* matcher
) {
40 return matcher
== nullptr ? 0 : matcher
->getPattern().length();
44 * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
45 * valid, whether they are equal according to operator==. Similar to Java Objects.equals()
47 static bool equals(const AffixPatternMatcher
* lhs
, const AffixPatternMatcher
* rhs
) {
48 if (lhs
== nullptr && rhs
== nullptr) {
51 if (lhs
== nullptr || rhs
== nullptr) {
60 AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString
& pattern
,
61 AffixTokenMatcherWarehouse
& warehouse
,
62 IgnorablesMatcher
* ignorables
)
66 fWarehouse(warehouse
),
67 fIgnorables(ignorables
) {}
69 void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type
, UChar32 cp
, UErrorCode
& status
) {
70 // This is called by AffixUtils.iterateWithConsumer() for each token.
72 // Add an ignorables matcher between tokens except between two literals, and don't put two
73 // ignorables matchers in a row.
74 if (fIgnorables
!= nullptr && fMatchersLen
> 0 &&
75 (fLastTypeOrCp
< 0 || !fIgnorables
->getSet()->contains(fLastTypeOrCp
))) {
76 addMatcher(*fIgnorables
);
79 if (type
!= TYPE_CODEPOINT
) {
80 // Case 1: the token is a symbol.
83 addMatcher(fWarehouse
.minusSign());
86 addMatcher(fWarehouse
.plusSign());
89 addMatcher(fWarehouse
.percent());
92 addMatcher(fWarehouse
.permille());
94 case TYPE_CURRENCY_SINGLE
:
95 case TYPE_CURRENCY_DOUBLE
:
96 case TYPE_CURRENCY_TRIPLE
:
97 case TYPE_CURRENCY_QUAD
:
98 case TYPE_CURRENCY_QUINT
:
99 // All currency symbols use the same matcher
100 addMatcher(fWarehouse
.currency(status
));
106 } else if (fIgnorables
!= nullptr && fIgnorables
->getSet()->contains(cp
)) {
107 // Case 2: the token is an ignorable literal.
108 // No action necessary: the ignorables matcher has already been added.
111 // Case 3: the token is a non-ignorable literal.
112 addMatcher(fWarehouse
.nextCodePointMatcher(cp
));
114 fLastTypeOrCp
= type
!= TYPE_CODEPOINT
? type
: cp
;
117 void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher
& matcher
) {
118 if (fMatchersLen
>= fMatchers
.getCapacity()) {
119 fMatchers
.resize(fMatchersLen
* 2, fMatchersLen
);
121 fMatchers
[fMatchersLen
++] = &matcher
;
124 AffixPatternMatcher
AffixPatternMatcherBuilder::build() {
125 return AffixPatternMatcher(fMatchers
, fMatchersLen
, fPattern
);
129 CodePointMatcherWarehouse::CodePointMatcherWarehouse()
130 : codePointCount(0), codePointNumBatches(0) {}
132 CodePointMatcherWarehouse::~CodePointMatcherWarehouse() {
133 // Delete the variable number of batches of code point matchers
134 for (int32_t i
= 0; i
< codePointNumBatches
; i
++) {
135 delete[] codePointsOverflow
[i
];
139 CodePointMatcherWarehouse::CodePointMatcherWarehouse(CodePointMatcherWarehouse
&& src
) U_NOEXCEPT
140 : codePoints(std::move(src
.codePoints
)),
141 codePointsOverflow(std::move(src
.codePointsOverflow
)),
142 codePointCount(src
.codePointCount
),
143 codePointNumBatches(src
.codePointNumBatches
) {}
145 CodePointMatcherWarehouse
&
146 CodePointMatcherWarehouse::operator=(CodePointMatcherWarehouse
&& src
) U_NOEXCEPT
{
147 codePoints
= std::move(src
.codePoints
);
148 codePointsOverflow
= std::move(src
.codePointsOverflow
);
149 codePointCount
= src
.codePointCount
;
150 codePointNumBatches
= src
.codePointNumBatches
;
154 NumberParseMatcher
& CodePointMatcherWarehouse::nextCodePointMatcher(UChar32 cp
) {
155 if (codePointCount
< CODE_POINT_STACK_CAPACITY
) {
156 return codePoints
[codePointCount
++] = {cp
};
158 int32_t totalCapacity
= CODE_POINT_STACK_CAPACITY
+ codePointNumBatches
* CODE_POINT_BATCH_SIZE
;
159 if (codePointCount
>= totalCapacity
) {
161 auto* nextBatch
= new CodePointMatcher
[CODE_POINT_BATCH_SIZE
];
162 if (codePointNumBatches
>= codePointsOverflow
.getCapacity()) {
163 // Need more room for storing pointers to batches
164 codePointsOverflow
.resize(codePointNumBatches
* 2, codePointNumBatches
);
166 codePointsOverflow
[codePointNumBatches
++] = nextBatch
;
168 return codePointsOverflow
[codePointNumBatches
- 1][(codePointCount
++ - CODE_POINT_STACK_CAPACITY
) %
169 CODE_POINT_BATCH_SIZE
] = {cp
};
173 AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData
* setupData
)
174 : fSetupData(setupData
) {}
176 NumberParseMatcher
& AffixTokenMatcherWarehouse::minusSign() {
177 return fMinusSign
= {fSetupData
->dfs
, true};
180 NumberParseMatcher
& AffixTokenMatcherWarehouse::plusSign() {
181 return fPlusSign
= {fSetupData
->dfs
, true};
184 NumberParseMatcher
& AffixTokenMatcherWarehouse::percent() {
185 return fPercent
= {fSetupData
->dfs
};
188 NumberParseMatcher
& AffixTokenMatcherWarehouse::permille() {
189 return fPermille
= {fSetupData
->dfs
};
192 NumberParseMatcher
& AffixTokenMatcherWarehouse::currency(UErrorCode
& status
) {
193 return fCurrency
= {fSetupData
->currencySymbols
, fSetupData
->dfs
, fSetupData
->parseFlags
, status
};
196 IgnorablesMatcher
& AffixTokenMatcherWarehouse::ignorables() {
197 return fSetupData
->ignorables
;
200 NumberParseMatcher
& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp
) {
201 return fCodePoints
.nextCodePointMatcher(cp
);
205 CodePointMatcher::CodePointMatcher(UChar32 cp
)
208 bool CodePointMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, UErrorCode
&) const {
209 if (segment
.startsWith(fCp
)) {
210 segment
.adjustOffsetByCodePoint();
211 result
.setCharsConsumed(segment
);
216 bool CodePointMatcher::smokeTest(const StringSegment
& segment
) const {
217 return segment
.startsWith(fCp
);
220 UnicodeString
CodePointMatcher::toString() const {
221 return u
"<CodePoint>";
225 AffixPatternMatcher
AffixPatternMatcher::fromAffixPattern(const UnicodeString
& affixPattern
,
226 AffixTokenMatcherWarehouse
& tokenWarehouse
,
227 parse_flags_t parseFlags
, bool* success
,
228 UErrorCode
& status
) {
229 if (affixPattern
.isEmpty()) {
235 IgnorablesMatcher
* ignorables
;
236 if (0 != (parseFlags
& PARSE_FLAG_EXACT_AFFIX
)) {
237 ignorables
= nullptr;
239 ignorables
= &tokenWarehouse
.ignorables();
242 AffixPatternMatcherBuilder
builder(affixPattern
, tokenWarehouse
, ignorables
);
243 AffixUtils::iterateWithConsumer(affixPattern
, builder
, status
);
244 return builder
.build();
247 AffixPatternMatcher::AffixPatternMatcher(MatcherArray
& matchers
, int32_t matchersLen
,
248 const UnicodeString
& pattern
)
249 : ArraySeriesMatcher(matchers
, matchersLen
), fPattern(pattern
) {}
251 UnicodeString
AffixPatternMatcher::getPattern() const {
252 return fPattern
.toAliasedUnicodeString();
255 bool AffixPatternMatcher::operator==(const AffixPatternMatcher
& other
) const {
256 return fPattern
== other
.fPattern
;
260 AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse
* tokenWarehouse
)
261 : fTokenWarehouse(tokenWarehouse
) {
264 bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider
& patternInfo
,
265 const IgnorablesMatcher
& ignorables
, parse_flags_t parseFlags
,
266 UErrorCode
& status
) {
267 UnicodeString posPrefixString
= patternInfo
.getString(AffixPatternProvider::AFFIX_POS_PREFIX
);
268 UnicodeString posSuffixString
= patternInfo
.getString(AffixPatternProvider::AFFIX_POS_SUFFIX
);
269 UnicodeString negPrefixString
;
270 UnicodeString negSuffixString
;
271 if (patternInfo
.hasNegativeSubpattern()) {
272 negPrefixString
= patternInfo
.getString(AffixPatternProvider::AFFIX_NEG_PREFIX
);
273 negSuffixString
= patternInfo
.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX
);
276 if (0 == (parseFlags
& PARSE_FLAG_USE_FULL_AFFIXES
) &&
277 AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString
, *ignorables
.getSet(), status
) &&
278 AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString
, *ignorables
.getSet(), status
) &&
279 AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString
, *ignorables
.getSet(), status
) &&
280 AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString
, *ignorables
.getSet(), status
)
281 // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
282 // trailing in the pattern string.
283 && !AffixUtils::containsType(posSuffixString
, TYPE_PLUS_SIGN
, status
) &&
284 !AffixUtils::containsType(posSuffixString
, TYPE_MINUS_SIGN
, status
) &&
285 !AffixUtils::containsType(negSuffixString
, TYPE_PLUS_SIGN
, status
) &&
286 !AffixUtils::containsType(negSuffixString
, TYPE_MINUS_SIGN
, status
)) {
287 // The affixes contain only symbols and ignorables.
288 // No need to generate affix matchers.
294 void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider
& patternInfo
,
295 MutableMatcherCollection
& output
,
296 const IgnorablesMatcher
& ignorables
,
297 parse_flags_t parseFlags
, UErrorCode
& status
) {
298 if (!isInteresting(patternInfo
, ignorables
, parseFlags
, status
)) {
302 // The affixes have interesting characters, or we are in strict mode.
303 // Use initial capacity of 6, the highest possible number of AffixMatchers.
305 bool includeUnpaired
= 0 != (parseFlags
& PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES
);
306 UNumberSignDisplay signDisplay
= (0 != (parseFlags
& PARSE_FLAG_PLUS_SIGN_ALLOWED
)) ? UNUM_SIGN_ALWAYS
309 int32_t numAffixMatchers
= 0;
310 int32_t numAffixPatternMatchers
= 0;
312 AffixPatternMatcher
* posPrefix
= nullptr;
313 AffixPatternMatcher
* posSuffix
= nullptr;
315 // Pre-process the affix strings to resolve LDML rules like sign display.
316 for (int8_t signum
= 1; signum
>= -1; signum
--) {
318 bool hasPrefix
= false;
319 PatternStringUtils::patternInfoToStringBuilder(
320 patternInfo
, true, signum
, signDisplay
, StandardPlural::OTHER
, false, sb
);
321 fAffixPatternMatchers
[numAffixPatternMatchers
] = AffixPatternMatcher::fromAffixPattern(
322 sb
, *fTokenWarehouse
, parseFlags
, &hasPrefix
, status
);
323 AffixPatternMatcher
* prefix
= hasPrefix
? &fAffixPatternMatchers
[numAffixPatternMatchers
++]
327 bool hasSuffix
= false;
328 PatternStringUtils::patternInfoToStringBuilder(
329 patternInfo
, false, signum
, signDisplay
, StandardPlural::OTHER
, false, sb
);
330 fAffixPatternMatchers
[numAffixPatternMatchers
] = AffixPatternMatcher::fromAffixPattern(
331 sb
, *fTokenWarehouse
, parseFlags
, &hasSuffix
, status
);
332 AffixPatternMatcher
* suffix
= hasSuffix
? &fAffixPatternMatchers
[numAffixPatternMatchers
++]
338 } else if (equals(prefix
, posPrefix
) && equals(suffix
, posSuffix
)) {
339 // Skip adding these matchers (we already have equivalents)
343 // Flags for setting in the ParsedNumber; the token matchers may add more.
344 int flags
= (signum
== -1) ? FLAG_NEGATIVE
: 0;
346 // Note: it is indeed possible for posPrefix and posSuffix to both be null.
347 // We still need to add that matcher for strict mode to work.
348 fAffixMatchers
[numAffixMatchers
++] = {prefix
, suffix
, flags
};
349 if (includeUnpaired
&& prefix
!= nullptr && suffix
!= nullptr) {
350 // The following if statements are designed to prevent adding two identical matchers.
351 if (signum
== 1 || !equals(prefix
, posPrefix
)) {
352 fAffixMatchers
[numAffixMatchers
++] = {prefix
, nullptr, flags
};
354 if (signum
== 1 || !equals(suffix
, posSuffix
)) {
355 fAffixMatchers
[numAffixMatchers
++] = {nullptr, suffix
, flags
};
360 // Put the AffixMatchers in order, and then add them to the output.
361 // Since there are at most 9 elements, do a simple-to-implement bubble sort.
365 for (int32_t i
= 1; i
< numAffixMatchers
; i
++) {
366 if (fAffixMatchers
[i
- 1].compareTo(fAffixMatchers
[i
]) > 0) {
368 AffixMatcher temp
= std::move(fAffixMatchers
[i
- 1]);
369 fAffixMatchers
[i
- 1] = std::move(fAffixMatchers
[i
]);
370 fAffixMatchers
[i
] = std::move(temp
);
373 } while (madeChanges
);
375 for (int32_t i
= 0; i
< numAffixMatchers
; i
++) {
376 // Enable the following line to debug affixes
377 //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
378 output
.addMatcher(fAffixMatchers
[i
]);
383 AffixMatcher::AffixMatcher(AffixPatternMatcher
* prefix
, AffixPatternMatcher
* suffix
, result_flags_t flags
)
384 : fPrefix(prefix
), fSuffix(suffix
), fFlags(flags
) {}
386 bool AffixMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, UErrorCode
& status
) const {
387 if (!result
.seenNumber()) {
390 // 1. We have already seen a prefix (result.prefix != null)
391 // 2. The prefix in this AffixMatcher is empty (prefix == null)
392 if (!result
.prefix
.isBogus() || fPrefix
== nullptr) {
396 // Attempt to match the prefix.
397 int initialOffset
= segment
.getOffset();
398 bool maybeMore
= fPrefix
->match(segment
, result
, status
);
399 if (initialOffset
!= segment
.getOffset()) {
400 result
.prefix
= fPrefix
->getPattern();
407 // 1. We have already seen a suffix (result.suffix != null)
408 // 2. The suffix in this AffixMatcher is empty (suffix == null)
409 // 3. The matched prefix does not equal this AffixMatcher's prefix
410 if (!result
.suffix
.isBogus() || fSuffix
== nullptr || !matched(fPrefix
, result
.prefix
)) {
414 // Attempt to match the suffix.
415 int initialOffset
= segment
.getOffset();
416 bool maybeMore
= fSuffix
->match(segment
, result
, status
);
417 if (initialOffset
!= segment
.getOffset()) {
418 result
.suffix
= fSuffix
->getPattern();
424 bool AffixMatcher::smokeTest(const StringSegment
& segment
) const {
425 return (fPrefix
!= nullptr && fPrefix
->smokeTest(segment
)) ||
426 (fSuffix
!= nullptr && fSuffix
->smokeTest(segment
));
429 void AffixMatcher::postProcess(ParsedNumber
& result
) const {
430 // Check to see if our affix is the one that was matched. If so, set the flags in the result.
431 if (matched(fPrefix
, result
.prefix
) && matched(fSuffix
, result
.suffix
)) {
432 // Fill in the result prefix and suffix with non-null values (empty string).
433 // Used by strict mode to determine whether an entire affix pair was matched.
434 if (result
.prefix
.isBogus()) {
435 result
.prefix
= UnicodeString();
437 if (result
.suffix
.isBogus()) {
438 result
.suffix
= UnicodeString();
440 result
.flags
|= fFlags
;
441 if (fPrefix
!= nullptr) {
442 fPrefix
->postProcess(result
);
444 if (fSuffix
!= nullptr) {
445 fSuffix
->postProcess(result
);
450 int8_t AffixMatcher::compareTo(const AffixMatcher
& rhs
) const {
451 const AffixMatcher
& lhs
= *this;
452 if (length(lhs
.fPrefix
) != length(rhs
.fPrefix
)) {
453 return length(lhs
.fPrefix
) > length(rhs
.fPrefix
) ? -1 : 1;
454 } else if (length(lhs
.fSuffix
) != length(rhs
.fSuffix
)) {
455 return length(lhs
.fSuffix
) > length(rhs
.fSuffix
) ? -1 : 1;
461 UnicodeString
AffixMatcher::toString() const {
462 bool isNegative
= 0 != (fFlags
& FLAG_NEGATIVE
);
463 return UnicodeString(u
"<Affix") + (isNegative
? u
":negative " : u
" ") +
464 (fPrefix
? fPrefix
->getPattern() : u
"null") + u
"#" +
465 (fSuffix
? fSuffix
->getPattern() : u
"null") + u
">";
470 #endif /* #if !UCONFIG_NO_FORMATTING */