]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | // © 2018 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | #include "unicode/utypes.h" | |
5 | ||
6 | #if !UCONFIG_NO_FORMATTING | |
7 | ||
8 | // Allow implicit conversion from char16_t* to UnicodeString for this file: | |
9 | // Helpful in toString methods and elsewhere. | |
10 | #define UNISTR_FROM_STRING_EXPLICIT | |
11 | ||
12 | #include "numparse_types.h" | |
13 | #include "numparse_affixes.h" | |
14 | #include "numparse_utils.h" | |
15 | #include "number_utils.h" | |
16 | ||
17 | using namespace icu; | |
18 | using namespace icu::numparse; | |
19 | using namespace icu::numparse::impl; | |
20 | using namespace icu::number; | |
21 | using namespace icu::number::impl; | |
22 | ||
23 | ||
24 | namespace { | |
25 | ||
26 | /** | |
27 | * Helper method to return whether the given AffixPatternMatcher equals the given pattern string. | |
28 | * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal | |
29 | * the given pattern string. | |
30 | */ | |
31 | static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) { | |
32 | return (affix == nullptr && patternString.isBogus()) || | |
33 | (affix != nullptr && affix->getPattern() == patternString); | |
34 | } | |
35 | ||
36 | /** | |
37 | * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null. | |
38 | */ | |
39 | static int32_t length(const AffixPatternMatcher* matcher) { | |
40 | return matcher == nullptr ? 0 : matcher->getPattern().length(); | |
41 | } | |
42 | ||
43 | /** | |
44 | * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both | |
45 | * valid, whether they are equal according to operator==. Similar to Java Objects.equals() | |
46 | */ | |
47 | static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) { | |
48 | if (lhs == nullptr && rhs == nullptr) { | |
49 | return true; | |
50 | } | |
51 | if (lhs == nullptr || rhs == nullptr) { | |
52 | return false; | |
53 | } | |
54 | return *lhs == *rhs; | |
55 | } | |
56 | ||
57 | } | |
58 | ||
59 | ||
60 | AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern, | |
61 | AffixTokenMatcherWarehouse& warehouse, | |
62 | IgnorablesMatcher* ignorables) | |
63 | : fMatchersLen(0), | |
64 | fLastTypeOrCp(0), | |
65 | fPattern(pattern), | |
66 | fWarehouse(warehouse), | |
67 | fIgnorables(ignorables) {} | |
68 | ||
69 | void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) { | |
70 | // This is called by AffixUtils.iterateWithConsumer() for each token. | |
71 | ||
72 | // Add an ignorables matcher between tokens except between two literals, and don't put two | |
73 | // ignorables matchers in a row. | |
74 | if (fIgnorables != nullptr && fMatchersLen > 0 && | |
75 | (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) { | |
76 | addMatcher(*fIgnorables); | |
77 | } | |
78 | ||
79 | if (type != TYPE_CODEPOINT) { | |
80 | // Case 1: the token is a symbol. | |
81 | switch (type) { | |
82 | case TYPE_MINUS_SIGN: | |
83 | addMatcher(fWarehouse.minusSign()); | |
84 | break; | |
85 | case TYPE_PLUS_SIGN: | |
86 | addMatcher(fWarehouse.plusSign()); | |
87 | break; | |
88 | case TYPE_PERCENT: | |
89 | addMatcher(fWarehouse.percent()); | |
90 | break; | |
91 | case TYPE_PERMILLE: | |
92 | addMatcher(fWarehouse.permille()); | |
93 | break; | |
94 | case TYPE_CURRENCY_SINGLE: | |
95 | case TYPE_CURRENCY_DOUBLE: | |
96 | case TYPE_CURRENCY_TRIPLE: | |
97 | case TYPE_CURRENCY_QUAD: | |
98 | case TYPE_CURRENCY_QUINT: | |
99 | // All currency symbols use the same matcher | |
100 | addMatcher(fWarehouse.currency(status)); | |
101 | break; | |
102 | default: | |
3d1f044b | 103 | UPRV_UNREACHABLE; |
0f5d89e8 A |
104 | } |
105 | ||
106 | } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) { | |
107 | // Case 2: the token is an ignorable literal. | |
108 | // No action necessary: the ignorables matcher has already been added. | |
109 | ||
110 | } else { | |
111 | // Case 3: the token is a non-ignorable literal. | |
3d1f044b A |
112 | if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) { |
113 | addMatcher(*ptr); | |
114 | } else { | |
115 | // OOM; unwind the stack | |
116 | return; | |
117 | } | |
0f5d89e8 A |
118 | } |
119 | fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp; | |
120 | } | |
121 | ||
122 | void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) { | |
123 | if (fMatchersLen >= fMatchers.getCapacity()) { | |
124 | fMatchers.resize(fMatchersLen * 2, fMatchersLen); | |
125 | } | |
126 | fMatchers[fMatchersLen++] = &matcher; | |
127 | } | |
128 | ||
129 | AffixPatternMatcher AffixPatternMatcherBuilder::build() { | |
130 | return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern); | |
131 | } | |
132 | ||
0f5d89e8 A |
133 | AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData) |
134 | : fSetupData(setupData) {} | |
135 | ||
136 | NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() { | |
137 | return fMinusSign = {fSetupData->dfs, true}; | |
138 | } | |
139 | ||
140 | NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() { | |
141 | return fPlusSign = {fSetupData->dfs, true}; | |
142 | } | |
143 | ||
144 | NumberParseMatcher& AffixTokenMatcherWarehouse::percent() { | |
145 | return fPercent = {fSetupData->dfs}; | |
146 | } | |
147 | ||
148 | NumberParseMatcher& AffixTokenMatcherWarehouse::permille() { | |
149 | return fPermille = {fSetupData->dfs}; | |
150 | } | |
151 | ||
152 | NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) { | |
153 | return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status}; | |
154 | } | |
155 | ||
156 | IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() { | |
157 | return fSetupData->ignorables; | |
158 | } | |
159 | ||
3d1f044b A |
160 | NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) { |
161 | if (U_FAILURE(status)) { | |
162 | return nullptr; | |
163 | } | |
164 | auto* result = fCodePoints.create(cp); | |
165 | if (result == nullptr) { | |
166 | status = U_MEMORY_ALLOCATION_ERROR; | |
167 | } | |
168 | return result; | |
0f5d89e8 A |
169 | } |
170 | ||
171 | ||
172 | CodePointMatcher::CodePointMatcher(UChar32 cp) | |
173 | : fCp(cp) {} | |
174 | ||
175 | bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const { | |
176 | if (segment.startsWith(fCp)) { | |
177 | segment.adjustOffsetByCodePoint(); | |
178 | result.setCharsConsumed(segment); | |
179 | } | |
180 | return false; | |
181 | } | |
182 | ||
183 | bool CodePointMatcher::smokeTest(const StringSegment& segment) const { | |
184 | return segment.startsWith(fCp); | |
185 | } | |
186 | ||
187 | UnicodeString CodePointMatcher::toString() const { | |
188 | return u"<CodePoint>"; | |
189 | } | |
190 | ||
191 | ||
192 | AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern, | |
193 | AffixTokenMatcherWarehouse& tokenWarehouse, | |
194 | parse_flags_t parseFlags, bool* success, | |
195 | UErrorCode& status) { | |
196 | if (affixPattern.isEmpty()) { | |
197 | *success = false; | |
198 | return {}; | |
199 | } | |
200 | *success = true; | |
201 | ||
202 | IgnorablesMatcher* ignorables; | |
203 | if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) { | |
204 | ignorables = nullptr; | |
205 | } else { | |
206 | ignorables = &tokenWarehouse.ignorables(); | |
207 | } | |
208 | ||
209 | AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables); | |
210 | AffixUtils::iterateWithConsumer(affixPattern, builder, status); | |
211 | return builder.build(); | |
212 | } | |
213 | ||
214 | AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen, | |
215 | const UnicodeString& pattern) | |
216 | : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern) {} | |
217 | ||
218 | UnicodeString AffixPatternMatcher::getPattern() const { | |
219 | return fPattern.toAliasedUnicodeString(); | |
220 | } | |
221 | ||
222 | bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const { | |
223 | return fPattern == other.fPattern; | |
224 | } | |
225 | ||
226 | ||
227 | AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse) | |
228 | : fTokenWarehouse(tokenWarehouse) { | |
229 | } | |
230 | ||
231 | bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo, | |
232 | const IgnorablesMatcher& ignorables, parse_flags_t parseFlags, | |
233 | UErrorCode& status) { | |
234 | UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX); | |
235 | UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX); | |
236 | UnicodeString negPrefixString; | |
237 | UnicodeString negSuffixString; | |
238 | if (patternInfo.hasNegativeSubpattern()) { | |
239 | negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX); | |
240 | negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX); | |
241 | } | |
242 | ||
243 | if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) && | |
244 | AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) && | |
245 | AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) && | |
246 | AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) && | |
247 | AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status) | |
248 | // HACK: Plus and minus sign are a special case: we accept them trailing only if they are | |
249 | // trailing in the pattern string. | |
250 | && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) && | |
251 | !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) && | |
252 | !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) && | |
253 | !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) { | |
254 | // The affixes contain only symbols and ignorables. | |
255 | // No need to generate affix matchers. | |
256 | return false; | |
257 | } | |
258 | return true; | |
259 | } | |
260 | ||
261 | void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo, | |
262 | MutableMatcherCollection& output, | |
263 | const IgnorablesMatcher& ignorables, | |
264 | parse_flags_t parseFlags, UErrorCode& status) { | |
265 | if (!isInteresting(patternInfo, ignorables, parseFlags, status)) { | |
266 | return; | |
267 | } | |
268 | ||
269 | // The affixes have interesting characters, or we are in strict mode. | |
270 | // Use initial capacity of 6, the highest possible number of AffixMatchers. | |
271 | UnicodeString sb; | |
272 | bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES); | |
273 | UNumberSignDisplay signDisplay = (0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) ? UNUM_SIGN_ALWAYS | |
274 | : UNUM_SIGN_AUTO; | |
275 | ||
276 | int32_t numAffixMatchers = 0; | |
277 | int32_t numAffixPatternMatchers = 0; | |
278 | ||
279 | AffixPatternMatcher* posPrefix = nullptr; | |
280 | AffixPatternMatcher* posSuffix = nullptr; | |
281 | ||
282 | // Pre-process the affix strings to resolve LDML rules like sign display. | |
283 | for (int8_t signum = 1; signum >= -1; signum--) { | |
284 | // Generate Prefix | |
285 | bool hasPrefix = false; | |
286 | PatternStringUtils::patternInfoToStringBuilder( | |
287 | patternInfo, true, signum, signDisplay, StandardPlural::OTHER, false, sb); | |
288 | fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( | |
289 | sb, *fTokenWarehouse, parseFlags, &hasPrefix, status); | |
290 | AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++] | |
291 | : nullptr; | |
292 | ||
293 | // Generate Suffix | |
294 | bool hasSuffix = false; | |
295 | PatternStringUtils::patternInfoToStringBuilder( | |
296 | patternInfo, false, signum, signDisplay, StandardPlural::OTHER, false, sb); | |
297 | fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( | |
298 | sb, *fTokenWarehouse, parseFlags, &hasSuffix, status); | |
299 | AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++] | |
300 | : nullptr; | |
301 | ||
302 | if (signum == 1) { | |
303 | posPrefix = prefix; | |
304 | posSuffix = suffix; | |
305 | } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) { | |
306 | // Skip adding these matchers (we already have equivalents) | |
307 | continue; | |
308 | } | |
309 | ||
310 | // Flags for setting in the ParsedNumber; the token matchers may add more. | |
311 | int flags = (signum == -1) ? FLAG_NEGATIVE : 0; | |
312 | ||
313 | // Note: it is indeed possible for posPrefix and posSuffix to both be null. | |
314 | // We still need to add that matcher for strict mode to work. | |
315 | fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags}; | |
316 | if (includeUnpaired && prefix != nullptr && suffix != nullptr) { | |
317 | // The following if statements are designed to prevent adding two identical matchers. | |
318 | if (signum == 1 || !equals(prefix, posPrefix)) { | |
319 | fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags}; | |
320 | } | |
321 | if (signum == 1 || !equals(suffix, posSuffix)) { | |
322 | fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags}; | |
323 | } | |
324 | } | |
325 | } | |
326 | ||
327 | // Put the AffixMatchers in order, and then add them to the output. | |
328 | // Since there are at most 9 elements, do a simple-to-implement bubble sort. | |
329 | bool madeChanges; | |
330 | do { | |
331 | madeChanges = false; | |
332 | for (int32_t i = 1; i < numAffixMatchers; i++) { | |
333 | if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) { | |
334 | madeChanges = true; | |
335 | AffixMatcher temp = std::move(fAffixMatchers[i - 1]); | |
336 | fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]); | |
337 | fAffixMatchers[i] = std::move(temp); | |
338 | } | |
339 | } | |
340 | } while (madeChanges); | |
341 | ||
342 | for (int32_t i = 0; i < numAffixMatchers; i++) { | |
343 | // Enable the following line to debug affixes | |
344 | //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl; | |
345 | output.addMatcher(fAffixMatchers[i]); | |
346 | } | |
347 | } | |
348 | ||
349 | ||
350 | AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags) | |
351 | : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {} | |
352 | ||
353 | bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { | |
3d1f044b | 354 | bool startCurrencyIsEmpty = (result.currencyCode[0]==0); // Apple fix for <rdar://problem/46915356> |
0f5d89e8 A |
355 | if (!result.seenNumber()) { |
356 | // Prefix | |
357 | // Do not match if: | |
358 | // 1. We have already seen a prefix (result.prefix != null) | |
359 | // 2. The prefix in this AffixMatcher is empty (prefix == null) | |
360 | if (!result.prefix.isBogus() || fPrefix == nullptr) { | |
361 | return false; | |
362 | } | |
363 | ||
364 | // Attempt to match the prefix. | |
365 | int initialOffset = segment.getOffset(); | |
366 | bool maybeMore = fPrefix->match(segment, result, status); | |
3d1f044b A |
367 | if (initialOffset != segment.getOffset() |
368 | || (startCurrencyIsEmpty && result.currencyCode[0]!=0)) { // Apple fix for <rdar://problem/46915356> | |
0f5d89e8 A |
369 | result.prefix = fPrefix->getPattern(); |
370 | } | |
371 | return maybeMore; | |
372 | ||
373 | } else { | |
374 | // Suffix | |
375 | // Do not match if: | |
376 | // 1. We have already seen a suffix (result.suffix != null) | |
377 | // 2. The suffix in this AffixMatcher is empty (suffix == null) | |
378 | // 3. The matched prefix does not equal this AffixMatcher's prefix | |
379 | if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) { | |
380 | return false; | |
381 | } | |
382 | ||
383 | // Attempt to match the suffix. | |
384 | int initialOffset = segment.getOffset(); | |
385 | bool maybeMore = fSuffix->match(segment, result, status); | |
3d1f044b A |
386 | if (initialOffset != segment.getOffset() |
387 | || (startCurrencyIsEmpty && result.currencyCode[0]!=0)) { // Apple fix for <rdar://problem/46915356> | |
0f5d89e8 A |
388 | result.suffix = fSuffix->getPattern(); |
389 | } | |
390 | return maybeMore; | |
391 | } | |
392 | } | |
393 | ||
394 | bool AffixMatcher::smokeTest(const StringSegment& segment) const { | |
395 | return (fPrefix != nullptr && fPrefix->smokeTest(segment)) || | |
396 | (fSuffix != nullptr && fSuffix->smokeTest(segment)); | |
397 | } | |
398 | ||
399 | void AffixMatcher::postProcess(ParsedNumber& result) const { | |
400 | // Check to see if our affix is the one that was matched. If so, set the flags in the result. | |
401 | if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) { | |
402 | // Fill in the result prefix and suffix with non-null values (empty string). | |
403 | // Used by strict mode to determine whether an entire affix pair was matched. | |
404 | if (result.prefix.isBogus()) { | |
405 | result.prefix = UnicodeString(); | |
406 | } | |
407 | if (result.suffix.isBogus()) { | |
408 | result.suffix = UnicodeString(); | |
409 | } | |
410 | result.flags |= fFlags; | |
411 | if (fPrefix != nullptr) { | |
412 | fPrefix->postProcess(result); | |
413 | } | |
414 | if (fSuffix != nullptr) { | |
415 | fSuffix->postProcess(result); | |
416 | } | |
417 | } | |
418 | } | |
419 | ||
420 | int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const { | |
421 | const AffixMatcher& lhs = *this; | |
422 | if (length(lhs.fPrefix) != length(rhs.fPrefix)) { | |
423 | return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1; | |
424 | } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) { | |
425 | return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1; | |
426 | } else { | |
427 | return 0; | |
428 | } | |
429 | } | |
430 | ||
431 | UnicodeString AffixMatcher::toString() const { | |
432 | bool isNegative = 0 != (fFlags & FLAG_NEGATIVE); | |
433 | return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") + | |
434 | (fPrefix ? fPrefix->getPattern() : u"null") + u"#" + | |
435 | (fSuffix ? fSuffix->getPattern() : u"null") + u">"; | |
436 | ||
437 | } | |
438 | ||
439 | ||
440 | #endif /* #if !UCONFIG_NO_FORMATTING */ | |
441 | ||
442 | ||
443 | ||
444 | ||
445 | ||
446 | ||
447 | ||
448 | ||
449 | ||
450 | ||
451 | ||
452 | ||
453 | ||
454 | ||
455 | ||
456 | ||
457 | ||
458 | ||
459 | ||
460 | ||
461 | ||
462 | ||
463 | ||
464 | ||
465 |