1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 #include "unicode/utypes.h"
6 #if !UCONFIG_NO_FORMATTING
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
12 #include "numparse_types.h"
13 #include "numparse_affixes.h"
14 #include "numparse_utils.h"
15 #include "number_utils.h"
18 using namespace icu::numparse
;
19 using namespace icu::numparse::impl
;
20 using namespace icu::number
;
21 using namespace icu::number::impl
;
27 * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
28 * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
29 * the given pattern string.
31 static bool matched(const AffixPatternMatcher
* affix
, const UnicodeString
& patternString
) {
32 return (affix
== nullptr && patternString
.isBogus()) ||
33 (affix
!= nullptr && affix
->getPattern() == patternString
);
37 * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
39 static int32_t length(const AffixPatternMatcher
* matcher
) {
40 return matcher
== nullptr ? 0 : matcher
->getPattern().length();
44 * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
45 * valid, whether they are equal according to operator==. Similar to Java Objects.equals()
47 static bool equals(const AffixPatternMatcher
* lhs
, const AffixPatternMatcher
* rhs
) {
48 if (lhs
== nullptr && rhs
== nullptr) {
51 if (lhs
== nullptr || rhs
== nullptr) {
60 AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString
& pattern
,
61 AffixTokenMatcherWarehouse
& warehouse
,
62 IgnorablesMatcher
* ignorables
)
66 fWarehouse(warehouse
),
67 fIgnorables(ignorables
) {}
69 void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type
, UChar32 cp
, UErrorCode
& status
) {
70 // This is called by AffixUtils.iterateWithConsumer() for each token.
72 // Add an ignorables matcher between tokens except between two literals, and don't put two
73 // ignorables matchers in a row.
74 if (fIgnorables
!= nullptr && fMatchersLen
> 0 &&
75 (fLastTypeOrCp
< 0 || !fIgnorables
->getSet()->contains(fLastTypeOrCp
))) {
76 addMatcher(*fIgnorables
);
79 if (type
!= TYPE_CODEPOINT
) {
80 // Case 1: the token is a symbol.
83 addMatcher(fWarehouse
.minusSign());
86 addMatcher(fWarehouse
.plusSign());
89 addMatcher(fWarehouse
.percent());
92 addMatcher(fWarehouse
.permille());
94 case TYPE_CURRENCY_SINGLE
:
95 case TYPE_CURRENCY_DOUBLE
:
96 case TYPE_CURRENCY_TRIPLE
:
97 case TYPE_CURRENCY_QUAD
:
98 case TYPE_CURRENCY_QUINT
:
99 // All currency symbols use the same matcher
100 addMatcher(fWarehouse
.currency(status
));
106 } else if (fIgnorables
!= nullptr && fIgnorables
->getSet()->contains(cp
)) {
107 // Case 2: the token is an ignorable literal.
108 // No action necessary: the ignorables matcher has already been added.
111 // Case 3: the token is a non-ignorable literal.
112 if (auto* ptr
= fWarehouse
.nextCodePointMatcher(cp
, status
)) {
115 // OOM; unwind the stack
119 fLastTypeOrCp
= type
!= TYPE_CODEPOINT
? type
: cp
;
122 void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher
& matcher
) {
123 if (fMatchersLen
>= fMatchers
.getCapacity()) {
124 fMatchers
.resize(fMatchersLen
* 2, fMatchersLen
);
126 fMatchers
[fMatchersLen
++] = &matcher
;
129 AffixPatternMatcher
AffixPatternMatcherBuilder::build() {
130 return AffixPatternMatcher(fMatchers
, fMatchersLen
, fPattern
);
133 AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData
* setupData
)
134 : fSetupData(setupData
) {}
136 NumberParseMatcher
& AffixTokenMatcherWarehouse::minusSign() {
137 return fMinusSign
= {fSetupData
->dfs
, true};
140 NumberParseMatcher
& AffixTokenMatcherWarehouse::plusSign() {
141 return fPlusSign
= {fSetupData
->dfs
, true};
144 NumberParseMatcher
& AffixTokenMatcherWarehouse::percent() {
145 return fPercent
= {fSetupData
->dfs
};
148 NumberParseMatcher
& AffixTokenMatcherWarehouse::permille() {
149 return fPermille
= {fSetupData
->dfs
};
152 NumberParseMatcher
& AffixTokenMatcherWarehouse::currency(UErrorCode
& status
) {
153 return fCurrency
= {fSetupData
->currencySymbols
, fSetupData
->dfs
, fSetupData
->parseFlags
, status
};
156 IgnorablesMatcher
& AffixTokenMatcherWarehouse::ignorables() {
157 return fSetupData
->ignorables
;
160 NumberParseMatcher
* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp
, UErrorCode
& status
) {
161 if (U_FAILURE(status
)) {
164 auto* result
= fCodePoints
.create(cp
);
165 if (result
== nullptr) {
166 status
= U_MEMORY_ALLOCATION_ERROR
;
172 CodePointMatcher::CodePointMatcher(UChar32 cp
)
175 bool CodePointMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, UErrorCode
&) const {
176 if (segment
.startsWith(fCp
)) {
177 segment
.adjustOffsetByCodePoint();
178 result
.setCharsConsumed(segment
);
183 bool CodePointMatcher::smokeTest(const StringSegment
& segment
) const {
184 return segment
.startsWith(fCp
);
187 UnicodeString
CodePointMatcher::toString() const {
188 return u
"<CodePoint>";
192 AffixPatternMatcher
AffixPatternMatcher::fromAffixPattern(const UnicodeString
& affixPattern
,
193 AffixTokenMatcherWarehouse
& tokenWarehouse
,
194 parse_flags_t parseFlags
, bool* success
,
195 UErrorCode
& status
) {
196 if (affixPattern
.isEmpty()) {
202 IgnorablesMatcher
* ignorables
;
203 if (0 != (parseFlags
& PARSE_FLAG_EXACT_AFFIX
)) {
204 ignorables
= nullptr;
206 ignorables
= &tokenWarehouse
.ignorables();
209 AffixPatternMatcherBuilder
builder(affixPattern
, tokenWarehouse
, ignorables
);
210 AffixUtils::iterateWithConsumer(affixPattern
, builder
, status
);
211 return builder
.build();
214 AffixPatternMatcher::AffixPatternMatcher(MatcherArray
& matchers
, int32_t matchersLen
,
215 const UnicodeString
& pattern
)
216 : ArraySeriesMatcher(matchers
, matchersLen
), fPattern(pattern
) {}
218 UnicodeString
AffixPatternMatcher::getPattern() const {
219 return fPattern
.toAliasedUnicodeString();
222 bool AffixPatternMatcher::operator==(const AffixPatternMatcher
& other
) const {
223 return fPattern
== other
.fPattern
;
227 AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse
* tokenWarehouse
)
228 : fTokenWarehouse(tokenWarehouse
) {
231 bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider
& patternInfo
,
232 const IgnorablesMatcher
& ignorables
, parse_flags_t parseFlags
,
233 UErrorCode
& status
) {
234 UnicodeString posPrefixString
= patternInfo
.getString(AffixPatternProvider::AFFIX_POS_PREFIX
);
235 UnicodeString posSuffixString
= patternInfo
.getString(AffixPatternProvider::AFFIX_POS_SUFFIX
);
236 UnicodeString negPrefixString
;
237 UnicodeString negSuffixString
;
238 if (patternInfo
.hasNegativeSubpattern()) {
239 negPrefixString
= patternInfo
.getString(AffixPatternProvider::AFFIX_NEG_PREFIX
);
240 negSuffixString
= patternInfo
.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX
);
243 if (0 == (parseFlags
& PARSE_FLAG_USE_FULL_AFFIXES
) &&
244 AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString
, *ignorables
.getSet(), status
) &&
245 AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString
, *ignorables
.getSet(), status
) &&
246 AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString
, *ignorables
.getSet(), status
) &&
247 AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString
, *ignorables
.getSet(), status
)
248 // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
249 // trailing in the pattern string.
250 && !AffixUtils::containsType(posSuffixString
, TYPE_PLUS_SIGN
, status
) &&
251 !AffixUtils::containsType(posSuffixString
, TYPE_MINUS_SIGN
, status
) &&
252 !AffixUtils::containsType(negSuffixString
, TYPE_PLUS_SIGN
, status
) &&
253 !AffixUtils::containsType(negSuffixString
, TYPE_MINUS_SIGN
, status
)) {
254 // The affixes contain only symbols and ignorables.
255 // No need to generate affix matchers.
261 void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider
& patternInfo
,
262 MutableMatcherCollection
& output
,
263 const IgnorablesMatcher
& ignorables
,
264 parse_flags_t parseFlags
, UErrorCode
& status
) {
265 if (!isInteresting(patternInfo
, ignorables
, parseFlags
, status
)) {
269 // The affixes have interesting characters, or we are in strict mode.
270 // Use initial capacity of 6, the highest possible number of AffixMatchers.
272 bool includeUnpaired
= 0 != (parseFlags
& PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES
);
273 UNumberSignDisplay signDisplay
= (0 != (parseFlags
& PARSE_FLAG_PLUS_SIGN_ALLOWED
)) ? UNUM_SIGN_ALWAYS
276 int32_t numAffixMatchers
= 0;
277 int32_t numAffixPatternMatchers
= 0;
279 AffixPatternMatcher
* posPrefix
= nullptr;
280 AffixPatternMatcher
* posSuffix
= nullptr;
282 // Pre-process the affix strings to resolve LDML rules like sign display.
283 for (int8_t signum
= 1; signum
>= -1; signum
--) {
285 bool hasPrefix
= false;
286 PatternStringUtils::patternInfoToStringBuilder(
287 patternInfo
, true, signum
, signDisplay
, StandardPlural::OTHER
, false, sb
);
288 fAffixPatternMatchers
[numAffixPatternMatchers
] = AffixPatternMatcher::fromAffixPattern(
289 sb
, *fTokenWarehouse
, parseFlags
, &hasPrefix
, status
);
290 AffixPatternMatcher
* prefix
= hasPrefix
? &fAffixPatternMatchers
[numAffixPatternMatchers
++]
294 bool hasSuffix
= false;
295 PatternStringUtils::patternInfoToStringBuilder(
296 patternInfo
, false, signum
, signDisplay
, StandardPlural::OTHER
, false, sb
);
297 fAffixPatternMatchers
[numAffixPatternMatchers
] = AffixPatternMatcher::fromAffixPattern(
298 sb
, *fTokenWarehouse
, parseFlags
, &hasSuffix
, status
);
299 AffixPatternMatcher
* suffix
= hasSuffix
? &fAffixPatternMatchers
[numAffixPatternMatchers
++]
305 } else if (equals(prefix
, posPrefix
) && equals(suffix
, posSuffix
)) {
306 // Skip adding these matchers (we already have equivalents)
310 // Flags for setting in the ParsedNumber; the token matchers may add more.
311 int flags
= (signum
== -1) ? FLAG_NEGATIVE
: 0;
313 // Note: it is indeed possible for posPrefix and posSuffix to both be null.
314 // We still need to add that matcher for strict mode to work.
315 fAffixMatchers
[numAffixMatchers
++] = {prefix
, suffix
, flags
};
316 if (includeUnpaired
&& prefix
!= nullptr && suffix
!= nullptr) {
317 // The following if statements are designed to prevent adding two identical matchers.
318 if (signum
== 1 || !equals(prefix
, posPrefix
)) {
319 fAffixMatchers
[numAffixMatchers
++] = {prefix
, nullptr, flags
};
321 if (signum
== 1 || !equals(suffix
, posSuffix
)) {
322 fAffixMatchers
[numAffixMatchers
++] = {nullptr, suffix
, flags
};
327 // Put the AffixMatchers in order, and then add them to the output.
328 // Since there are at most 9 elements, do a simple-to-implement bubble sort.
332 for (int32_t i
= 1; i
< numAffixMatchers
; i
++) {
333 if (fAffixMatchers
[i
- 1].compareTo(fAffixMatchers
[i
]) > 0) {
335 AffixMatcher temp
= std::move(fAffixMatchers
[i
- 1]);
336 fAffixMatchers
[i
- 1] = std::move(fAffixMatchers
[i
]);
337 fAffixMatchers
[i
] = std::move(temp
);
340 } while (madeChanges
);
342 for (int32_t i
= 0; i
< numAffixMatchers
; i
++) {
343 // Enable the following line to debug affixes
344 //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
345 output
.addMatcher(fAffixMatchers
[i
]);
350 AffixMatcher::AffixMatcher(AffixPatternMatcher
* prefix
, AffixPatternMatcher
* suffix
, result_flags_t flags
)
351 : fPrefix(prefix
), fSuffix(suffix
), fFlags(flags
) {}
353 bool AffixMatcher::match(StringSegment
& segment
, ParsedNumber
& result
, UErrorCode
& status
) const {
354 bool startCurrencyIsEmpty
= (result
.currencyCode
[0]==0); // Apple fix for <rdar://problem/46915356>
355 if (!result
.seenNumber()) {
358 // 1. We have already seen a prefix (result.prefix != null)
359 // 2. The prefix in this AffixMatcher is empty (prefix == null)
360 if (!result
.prefix
.isBogus() || fPrefix
== nullptr) {
364 // Attempt to match the prefix.
365 int initialOffset
= segment
.getOffset();
366 bool maybeMore
= fPrefix
->match(segment
, result
, status
);
367 if (initialOffset
!= segment
.getOffset()
368 || (startCurrencyIsEmpty
&& result
.currencyCode
[0]!=0)) { // Apple fix for <rdar://problem/46915356>
369 result
.prefix
= fPrefix
->getPattern();
376 // 1. We have already seen a suffix (result.suffix != null)
377 // 2. The suffix in this AffixMatcher is empty (suffix == null)
378 // 3. The matched prefix does not equal this AffixMatcher's prefix
379 if (!result
.suffix
.isBogus() || fSuffix
== nullptr || !matched(fPrefix
, result
.prefix
)) {
383 // Attempt to match the suffix.
384 int initialOffset
= segment
.getOffset();
385 bool maybeMore
= fSuffix
->match(segment
, result
, status
);
386 if (initialOffset
!= segment
.getOffset()
387 || (startCurrencyIsEmpty
&& result
.currencyCode
[0]!=0)) { // Apple fix for <rdar://problem/46915356>
388 result
.suffix
= fSuffix
->getPattern();
394 bool AffixMatcher::smokeTest(const StringSegment
& segment
) const {
395 return (fPrefix
!= nullptr && fPrefix
->smokeTest(segment
)) ||
396 (fSuffix
!= nullptr && fSuffix
->smokeTest(segment
));
399 void AffixMatcher::postProcess(ParsedNumber
& result
) const {
400 // Check to see if our affix is the one that was matched. If so, set the flags in the result.
401 if (matched(fPrefix
, result
.prefix
) && matched(fSuffix
, result
.suffix
)) {
402 // Fill in the result prefix and suffix with non-null values (empty string).
403 // Used by strict mode to determine whether an entire affix pair was matched.
404 if (result
.prefix
.isBogus()) {
405 result
.prefix
= UnicodeString();
407 if (result
.suffix
.isBogus()) {
408 result
.suffix
= UnicodeString();
410 result
.flags
|= fFlags
;
411 if (fPrefix
!= nullptr) {
412 fPrefix
->postProcess(result
);
414 if (fSuffix
!= nullptr) {
415 fSuffix
->postProcess(result
);
420 int8_t AffixMatcher::compareTo(const AffixMatcher
& rhs
) const {
421 const AffixMatcher
& lhs
= *this;
422 if (length(lhs
.fPrefix
) != length(rhs
.fPrefix
)) {
423 return length(lhs
.fPrefix
) > length(rhs
.fPrefix
) ? -1 : 1;
424 } else if (length(lhs
.fSuffix
) != length(rhs
.fSuffix
)) {
425 return length(lhs
.fSuffix
) > length(rhs
.fSuffix
) ? -1 : 1;
431 UnicodeString
AffixMatcher::toString() const {
432 bool isNegative
= 0 != (fFlags
& FLAG_NEGATIVE
);
433 return UnicodeString(u
"<Affix") + (isNegative
? u
":negative " : u
" ") +
434 (fPrefix
? fPrefix
->getPattern() : u
"null") + u
"#" +
435 (fSuffix
? fSuffix
->getPattern() : u
"null") + u
">";
440 #endif /* #if !UCONFIG_NO_FORMATTING */