]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/numparse_affixes.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / numparse_affixes.cpp
CommitLineData
0f5d89e8
A
1// © 2018 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4#include "unicode/utypes.h"
5
6#if !UCONFIG_NO_FORMATTING
7
8// Allow implicit conversion from char16_t* to UnicodeString for this file:
9// Helpful in toString methods and elsewhere.
10#define UNISTR_FROM_STRING_EXPLICIT
11
12#include "numparse_types.h"
13#include "numparse_affixes.h"
14#include "numparse_utils.h"
15#include "number_utils.h"
16
17using namespace icu;
18using namespace icu::numparse;
19using namespace icu::numparse::impl;
20using namespace icu::number;
21using namespace icu::number::impl;
22
23
24namespace {
25
26/**
27 * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
28 * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
29 * the given pattern string.
30 */
31static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) {
32 return (affix == nullptr && patternString.isBogus()) ||
33 (affix != nullptr && affix->getPattern() == patternString);
34}
35
36/**
37 * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
38 */
39static int32_t length(const AffixPatternMatcher* matcher) {
40 return matcher == nullptr ? 0 : matcher->getPattern().length();
41}
42
43/**
44 * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
45 * valid, whether they are equal according to operator==. Similar to Java Objects.equals()
46 */
47static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) {
48 if (lhs == nullptr && rhs == nullptr) {
49 return true;
50 }
51 if (lhs == nullptr || rhs == nullptr) {
52 return false;
53 }
54 return *lhs == *rhs;
55}
56
57}
58
59
60AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern,
61 AffixTokenMatcherWarehouse& warehouse,
62 IgnorablesMatcher* ignorables)
63 : fMatchersLen(0),
64 fLastTypeOrCp(0),
65 fPattern(pattern),
66 fWarehouse(warehouse),
67 fIgnorables(ignorables) {}
68
69void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) {
70 // This is called by AffixUtils.iterateWithConsumer() for each token.
71
72 // Add an ignorables matcher between tokens except between two literals, and don't put two
73 // ignorables matchers in a row.
74 if (fIgnorables != nullptr && fMatchersLen > 0 &&
75 (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) {
76 addMatcher(*fIgnorables);
77 }
78
79 if (type != TYPE_CODEPOINT) {
80 // Case 1: the token is a symbol.
81 switch (type) {
82 case TYPE_MINUS_SIGN:
83 addMatcher(fWarehouse.minusSign());
84 break;
85 case TYPE_PLUS_SIGN:
86 addMatcher(fWarehouse.plusSign());
87 break;
88 case TYPE_PERCENT:
89 addMatcher(fWarehouse.percent());
90 break;
91 case TYPE_PERMILLE:
92 addMatcher(fWarehouse.permille());
93 break;
94 case TYPE_CURRENCY_SINGLE:
95 case TYPE_CURRENCY_DOUBLE:
96 case TYPE_CURRENCY_TRIPLE:
97 case TYPE_CURRENCY_QUAD:
98 case TYPE_CURRENCY_QUINT:
99 // All currency symbols use the same matcher
100 addMatcher(fWarehouse.currency(status));
101 break;
102 default:
3d1f044b 103 UPRV_UNREACHABLE;
0f5d89e8
A
104 }
105
106 } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) {
107 // Case 2: the token is an ignorable literal.
108 // No action necessary: the ignorables matcher has already been added.
109
110 } else {
111 // Case 3: the token is a non-ignorable literal.
3d1f044b
A
112 if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) {
113 addMatcher(*ptr);
114 } else {
115 // OOM; unwind the stack
116 return;
117 }
0f5d89e8
A
118 }
119 fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp;
120}
121
122void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) {
123 if (fMatchersLen >= fMatchers.getCapacity()) {
124 fMatchers.resize(fMatchersLen * 2, fMatchersLen);
125 }
126 fMatchers[fMatchersLen++] = &matcher;
127}
128
129AffixPatternMatcher AffixPatternMatcherBuilder::build() {
130 return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern);
131}
132
0f5d89e8
A
133AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
134 : fSetupData(setupData) {}
135
136NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
137 return fMinusSign = {fSetupData->dfs, true};
138}
139
140NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
141 return fPlusSign = {fSetupData->dfs, true};
142}
143
144NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
145 return fPercent = {fSetupData->dfs};
146}
147
148NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
149 return fPermille = {fSetupData->dfs};
150}
151
152NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
153 return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status};
154}
155
156IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
157 return fSetupData->ignorables;
158}
159
3d1f044b
A
160NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) {
161 if (U_FAILURE(status)) {
162 return nullptr;
163 }
164 auto* result = fCodePoints.create(cp);
165 if (result == nullptr) {
166 status = U_MEMORY_ALLOCATION_ERROR;
167 }
168 return result;
0f5d89e8
A
169}
170
171
172CodePointMatcher::CodePointMatcher(UChar32 cp)
173 : fCp(cp) {}
174
175bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
176 if (segment.startsWith(fCp)) {
177 segment.adjustOffsetByCodePoint();
178 result.setCharsConsumed(segment);
179 }
180 return false;
181}
182
183bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
184 return segment.startsWith(fCp);
185}
186
187UnicodeString CodePointMatcher::toString() const {
188 return u"<CodePoint>";
189}
190
191
192AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
193 AffixTokenMatcherWarehouse& tokenWarehouse,
194 parse_flags_t parseFlags, bool* success,
195 UErrorCode& status) {
196 if (affixPattern.isEmpty()) {
197 *success = false;
198 return {};
199 }
200 *success = true;
201
202 IgnorablesMatcher* ignorables;
203 if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) {
204 ignorables = nullptr;
205 } else {
206 ignorables = &tokenWarehouse.ignorables();
207 }
208
209 AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
210 AffixUtils::iterateWithConsumer(affixPattern, builder, status);
211 return builder.build();
212}
213
214AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen,
215 const UnicodeString& pattern)
216 : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern) {}
217
218UnicodeString AffixPatternMatcher::getPattern() const {
219 return fPattern.toAliasedUnicodeString();
220}
221
222bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
223 return fPattern == other.fPattern;
224}
225
226
227AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
228 : fTokenWarehouse(tokenWarehouse) {
229}
230
231bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
232 const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
233 UErrorCode& status) {
234 UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX);
235 UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX);
236 UnicodeString negPrefixString;
237 UnicodeString negSuffixString;
238 if (patternInfo.hasNegativeSubpattern()) {
239 negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX);
240 negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX);
241 }
242
243 if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) &&
244 AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) &&
245 AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) &&
246 AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) &&
247 AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status)
248 // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
249 // trailing in the pattern string.
250 && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) &&
251 !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) &&
252 !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) &&
253 !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) {
254 // The affixes contain only symbols and ignorables.
255 // No need to generate affix matchers.
256 return false;
257 }
258 return true;
259}
260
261void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
262 MutableMatcherCollection& output,
263 const IgnorablesMatcher& ignorables,
264 parse_flags_t parseFlags, UErrorCode& status) {
265 if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
266 return;
267 }
268
269 // The affixes have interesting characters, or we are in strict mode.
270 // Use initial capacity of 6, the highest possible number of AffixMatchers.
271 UnicodeString sb;
272 bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
273 UNumberSignDisplay signDisplay = (0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) ? UNUM_SIGN_ALWAYS
274 : UNUM_SIGN_AUTO;
275
276 int32_t numAffixMatchers = 0;
277 int32_t numAffixPatternMatchers = 0;
278
279 AffixPatternMatcher* posPrefix = nullptr;
280 AffixPatternMatcher* posSuffix = nullptr;
281
282 // Pre-process the affix strings to resolve LDML rules like sign display.
283 for (int8_t signum = 1; signum >= -1; signum--) {
284 // Generate Prefix
285 bool hasPrefix = false;
286 PatternStringUtils::patternInfoToStringBuilder(
287 patternInfo, true, signum, signDisplay, StandardPlural::OTHER, false, sb);
288 fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
289 sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
290 AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
291 : nullptr;
292
293 // Generate Suffix
294 bool hasSuffix = false;
295 PatternStringUtils::patternInfoToStringBuilder(
296 patternInfo, false, signum, signDisplay, StandardPlural::OTHER, false, sb);
297 fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
298 sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
299 AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
300 : nullptr;
301
302 if (signum == 1) {
303 posPrefix = prefix;
304 posSuffix = suffix;
305 } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) {
306 // Skip adding these matchers (we already have equivalents)
307 continue;
308 }
309
310 // Flags for setting in the ParsedNumber; the token matchers may add more.
311 int flags = (signum == -1) ? FLAG_NEGATIVE : 0;
312
313 // Note: it is indeed possible for posPrefix and posSuffix to both be null.
314 // We still need to add that matcher for strict mode to work.
315 fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
316 if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
317 // The following if statements are designed to prevent adding two identical matchers.
318 if (signum == 1 || !equals(prefix, posPrefix)) {
319 fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
320 }
321 if (signum == 1 || !equals(suffix, posSuffix)) {
322 fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
323 }
324 }
325 }
326
327 // Put the AffixMatchers in order, and then add them to the output.
328 // Since there are at most 9 elements, do a simple-to-implement bubble sort.
329 bool madeChanges;
330 do {
331 madeChanges = false;
332 for (int32_t i = 1; i < numAffixMatchers; i++) {
333 if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
334 madeChanges = true;
335 AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
336 fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
337 fAffixMatchers[i] = std::move(temp);
338 }
339 }
340 } while (madeChanges);
341
342 for (int32_t i = 0; i < numAffixMatchers; i++) {
343 // Enable the following line to debug affixes
344 //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
345 output.addMatcher(fAffixMatchers[i]);
346 }
347}
348
349
350AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags)
351 : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {}
352
353bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
3d1f044b 354 bool startCurrencyIsEmpty = (result.currencyCode[0]==0); // Apple fix for <rdar://problem/46915356>
0f5d89e8
A
355 if (!result.seenNumber()) {
356 // Prefix
357 // Do not match if:
358 // 1. We have already seen a prefix (result.prefix != null)
359 // 2. The prefix in this AffixMatcher is empty (prefix == null)
360 if (!result.prefix.isBogus() || fPrefix == nullptr) {
361 return false;
362 }
363
364 // Attempt to match the prefix.
365 int initialOffset = segment.getOffset();
366 bool maybeMore = fPrefix->match(segment, result, status);
3d1f044b
A
367 if (initialOffset != segment.getOffset()
368 || (startCurrencyIsEmpty && result.currencyCode[0]!=0)) { // Apple fix for <rdar://problem/46915356>
0f5d89e8
A
369 result.prefix = fPrefix->getPattern();
370 }
371 return maybeMore;
372
373 } else {
374 // Suffix
375 // Do not match if:
376 // 1. We have already seen a suffix (result.suffix != null)
377 // 2. The suffix in this AffixMatcher is empty (suffix == null)
378 // 3. The matched prefix does not equal this AffixMatcher's prefix
379 if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) {
380 return false;
381 }
382
383 // Attempt to match the suffix.
384 int initialOffset = segment.getOffset();
385 bool maybeMore = fSuffix->match(segment, result, status);
3d1f044b
A
386 if (initialOffset != segment.getOffset()
387 || (startCurrencyIsEmpty && result.currencyCode[0]!=0)) { // Apple fix for <rdar://problem/46915356>
0f5d89e8
A
388 result.suffix = fSuffix->getPattern();
389 }
390 return maybeMore;
391 }
392}
393
394bool AffixMatcher::smokeTest(const StringSegment& segment) const {
395 return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
396 (fSuffix != nullptr && fSuffix->smokeTest(segment));
397}
398
399void AffixMatcher::postProcess(ParsedNumber& result) const {
400 // Check to see if our affix is the one that was matched. If so, set the flags in the result.
401 if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) {
402 // Fill in the result prefix and suffix with non-null values (empty string).
403 // Used by strict mode to determine whether an entire affix pair was matched.
404 if (result.prefix.isBogus()) {
405 result.prefix = UnicodeString();
406 }
407 if (result.suffix.isBogus()) {
408 result.suffix = UnicodeString();
409 }
410 result.flags |= fFlags;
411 if (fPrefix != nullptr) {
412 fPrefix->postProcess(result);
413 }
414 if (fSuffix != nullptr) {
415 fSuffix->postProcess(result);
416 }
417 }
418}
419
420int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
421 const AffixMatcher& lhs = *this;
422 if (length(lhs.fPrefix) != length(rhs.fPrefix)) {
423 return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1;
424 } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) {
425 return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1;
426 } else {
427 return 0;
428 }
429}
430
431UnicodeString AffixMatcher::toString() const {
432 bool isNegative = 0 != (fFlags & FLAG_NEGATIVE);
433 return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") +
434 (fPrefix ? fPrefix->getPattern() : u"null") + u"#" +
435 (fSuffix ? fSuffix->getPattern() : u"null") + u">";
436
437}
438
439
440#endif /* #if !UCONFIG_NO_FORMATTING */
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465