]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | // © 2018 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | #include "unicode/utypes.h" | |
5 | ||
6 | #if !UCONFIG_NO_FORMATTING | |
7 | #ifndef __NUMPARSE_TYPES_H__ | |
8 | #define __NUMPARSE_TYPES_H__ | |
9 | ||
10 | #include "unicode/uobject.h" | |
11 | #include "number_decimalquantity.h" | |
12 | ||
13 | U_NAMESPACE_BEGIN namespace numparse { | |
14 | namespace impl { | |
15 | ||
// Classes declared further down in this header.
class StringSegment;
class ParsedNumber;

// Integer carriers for bit sets built from the ResultFlags and ParseFlags enumerators.
using result_flags_t = int32_t;
using parse_flags_t = int32_t;
22 | ||
/**
 * Flags for the type result_flags_t.
 *
 * Each enumerator occupies its own bit; the bit positions match the original hexadecimal values.
 */
enum ResultFlags {
    FLAG_NEGATIVE = 1 << 0,
    FLAG_PERCENT = 1 << 1,
    FLAG_PERMILLE = 1 << 2,
    FLAG_HAS_EXPONENT = 1 << 3,
    // bit 4 (0x0010) was FLAG_HAS_DEFAULT_CURRENCY; no longer used
    FLAG_HAS_DECIMAL_SEPARATOR = 1 << 5,
    FLAG_NAN = 1 << 6,
    FLAG_INFINITY = 1 << 7,
    FLAG_FAIL = 1 << 8,
};
35 | ||
/**
 * Flags for the type parse_flags_t.
 *
 * Each enumerator occupies its own bit; the bit positions match the original hexadecimal values.
 */
enum ParseFlags {
    PARSE_FLAG_IGNORE_CASE = 1 << 0,
    PARSE_FLAG_MONETARY_SEPARATORS = 1 << 1,
    PARSE_FLAG_STRICT_SEPARATORS = 1 << 2,
    PARSE_FLAG_STRICT_GROUPING_SIZE = 1 << 3,
    PARSE_FLAG_INTEGER_ONLY = 1 << 4,
    PARSE_FLAG_GROUPING_DISABLED = 1 << 5,
    // bit 6 (0x0040) was PARSE_FLAG_FRACTION_GROUPING_ENABLED; see #10794
    PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES = 1 << 7,
    PARSE_FLAG_USE_FULL_AFFIXES = 1 << 8,
    PARSE_FLAG_EXACT_AFFIX = 1 << 9,
    PARSE_FLAG_PLUS_SIGN_ALLOWED = 1 << 10,
    // bit 11 (0x0800) was PARSE_FLAG_OPTIMIZE (no longer used); that value is now taken by
    // PARSE_FLAG_HAS_TRAIL_CURRENCY below
    // bit 12 (0x1000) is PARSE_FLAG_FORCE_BIG_DECIMAL; not used in ICU4C
    PARSE_FLAG_NO_FOREIGN_CURRENCY = 1 << 13,
    PARSE_FLAG_ALLOW_INFINITE_RECURSION = 1 << 14,
    PARSE_FLAG_HAS_TRAIL_CURRENCY = 1 << 11, // Apple <rdar://problem/51938595>
};
55 | ||
56 | ||
57 | // TODO: Is this class worthwhile? | |
58 | template<int32_t stackCapacity> | |
59 | class CompactUnicodeString { | |
60 | public: | |
61 | CompactUnicodeString() { | |
62 | static_assert(stackCapacity > 0, "cannot have zero space on stack"); | |
63 | fBuffer[0] = 0; | |
64 | } | |
65 | ||
66 | CompactUnicodeString(const UnicodeString& text) | |
67 | : fBuffer(text.length() + 1) { | |
3d1f044b | 68 | uprv_memcpy(fBuffer.getAlias(), text.getBuffer(), sizeof(UChar) * text.length()); |
0f5d89e8 A |
69 | fBuffer[text.length()] = 0; |
70 | } | |
71 | ||
72 | inline UnicodeString toAliasedUnicodeString() const { | |
73 | return UnicodeString(TRUE, fBuffer.getAlias(), -1); | |
74 | } | |
75 | ||
76 | bool operator==(const CompactUnicodeString& other) const { | |
77 | // Use the alias-only constructor and then call UnicodeString operator== | |
78 | return toAliasedUnicodeString() == other.toAliasedUnicodeString(); | |
79 | } | |
80 | ||
81 | private: | |
82 | MaybeStackArray<UChar, stackCapacity> fBuffer; | |
83 | }; | |
84 | ||
85 | ||
86 | /** | |
87 | * Struct-like class to hold the results of a parsing routine. | |
88 | * | |
89 | * @author sffc | |
90 | */ | |
91 | // Exported as U_I18N_API for tests | |
92 | class U_I18N_API ParsedNumber { | |
93 | public: | |
94 | ||
95 | /** | |
96 | * The numerical value that was parsed. | |
97 | */ | |
98 | ::icu::number::impl::DecimalQuantity quantity; | |
99 | ||
100 | /** | |
101 | * The index of the last char consumed during parsing. If parsing started at index 0, this is equal | |
102 | * to the number of chars consumed. This is NOT necessarily the same as the StringSegment offset; | |
103 | * "weak" chars, like whitespace, change the offset, but the charsConsumed is not touched until a | |
104 | * "strong" char is encountered. | |
105 | */ | |
106 | int32_t charEnd; | |
107 | ||
108 | /** | |
109 | * Boolean flags (see constants above). | |
110 | */ | |
111 | result_flags_t flags; | |
112 | ||
113 | /** | |
114 | * The pattern string corresponding to the prefix that got consumed. | |
115 | */ | |
116 | UnicodeString prefix; | |
117 | ||
118 | /** | |
119 | * The pattern string corresponding to the suffix that got consumed. | |
120 | */ | |
121 | UnicodeString suffix; | |
122 | ||
123 | /** | |
124 | * The currency that got consumed. | |
125 | */ | |
126 | UChar currencyCode[4]; | |
127 | ||
128 | ParsedNumber(); | |
129 | ||
130 | ParsedNumber(const ParsedNumber& other) = default; | |
131 | ||
132 | ParsedNumber& operator=(const ParsedNumber& other) = default; | |
133 | ||
134 | void clear(); | |
135 | ||
136 | /** | |
137 | * Call this method to register that a "strong" char was consumed. This should be done after calling | |
138 | * {@link StringSegment#setOffset} or {@link StringSegment#adjustOffset} except when the char is | |
139 | * "weak", like whitespace. | |
140 | * | |
141 | * <p> | |
142 | * <strong>What is a strong versus weak char?</strong> The behavior of number parsing is to "stop" | |
143 | * after reading the number, even if there is other content following the number. For example, after | |
144 | * parsing the string "123 " (123 followed by a space), the cursor should be set to 3, not 4, even | |
145 | * though there are matchers that accept whitespace. In this example, the digits are strong, whereas | |
146 | * the whitespace is weak. Grouping separators are weak, whereas decimal separators are strong. Most | |
147 | * other chars are strong. | |
148 | * | |
149 | * @param segment | |
150 | * The current StringSegment, usually immediately following a call to setOffset. | |
151 | */ | |
152 | void setCharsConsumed(const StringSegment& segment); | |
153 | ||
154 | /** Apply certain number-related flags to the DecimalQuantity. */ | |
155 | void postProcess(); | |
156 | ||
157 | /** | |
158 | * Returns whether this the parse was successful. To be successful, at least one char must have been | |
159 | * consumed, and the failure flag must not be set. | |
160 | */ | |
161 | bool success() const; | |
162 | ||
163 | bool seenNumber() const; | |
164 | ||
3d1f044b | 165 | double getDouble(UErrorCode& status) const; |
0f5d89e8 A |
166 | |
167 | void populateFormattable(Formattable& output, parse_flags_t parseFlags) const; | |
168 | ||
169 | bool isBetterThan(const ParsedNumber& other); | |
170 | }; | |
171 | ||
172 | ||
173 | /** | |
174 | * A mutable class allowing for a String with a variable offset and length. The charAt, length, and | |
175 | * subSequence methods all operate relative to the fixed offset into the String. | |
176 | * | |
177 | * @author sffc | |
178 | */ | |
179 | // Exported as U_I18N_API for tests | |
180 | class U_I18N_API StringSegment : public UMemory { | |
181 | public: | |
182 | StringSegment(const UnicodeString& str, bool ignoreCase); | |
183 | ||
184 | int32_t getOffset() const; | |
185 | ||
186 | void setOffset(int32_t start); | |
187 | ||
188 | /** | |
189 | * Equivalent to <code>setOffset(getOffset()+delta)</code>. | |
190 | * | |
191 | * <p> | |
192 | * This method is usually called by a Matcher to register that a char was consumed. If the char is | |
193 | * strong (it usually is, except for things like whitespace), follow this with a call to | |
194 | * {@link ParsedNumber#setCharsConsumed}. For more information on strong chars, see that method. | |
195 | */ | |
196 | void adjustOffset(int32_t delta); | |
197 | ||
198 | /** | |
199 | * Adjusts the offset by the width of the current code point, either 1 or 2 chars. | |
200 | */ | |
201 | void adjustOffsetByCodePoint(); | |
202 | ||
203 | void setLength(int32_t length); | |
204 | ||
205 | void resetLength(); | |
206 | ||
207 | int32_t length() const; | |
208 | ||
209 | char16_t charAt(int32_t index) const; | |
210 | ||
211 | UChar32 codePointAt(int32_t index) const; | |
212 | ||
213 | UnicodeString toUnicodeString() const; | |
214 | ||
215 | const UnicodeString toTempUnicodeString() const; | |
216 | ||
217 | /** | |
218 | * Returns the first code point in the string segment, or -1 if the string starts with an invalid | |
219 | * code point. | |
220 | * | |
221 | * <p> | |
222 | * <strong>Important:</strong> Most of the time, you should use {@link #matches}, which handles case | |
223 | * folding logic, instead of this method. | |
224 | */ | |
225 | UChar32 getCodePoint() const; | |
226 | ||
227 | /** | |
228 | * Returns true if the first code point of this StringSegment equals the given code point. | |
229 | * | |
230 | * <p> | |
231 | * This method will perform case folding if case folding is enabled for the parser. | |
232 | */ | |
233 | bool startsWith(UChar32 otherCp) const; | |
234 | ||
235 | /** | |
236 | * Returns true if the first code point of this StringSegment is in the given UnicodeSet. | |
237 | */ | |
238 | bool startsWith(const UnicodeSet& uniset) const; | |
239 | ||
240 | /** | |
241 | * Returns true if there is at least one code point of overlap between this StringSegment and the | |
242 | * given UnicodeString. | |
243 | */ | |
244 | bool startsWith(const UnicodeString& other) const; | |
245 | ||
246 | /** | |
247 | * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For | |
248 | * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2, | |
249 | * since the first 2 characters are the same. | |
250 | * | |
251 | * <p> | |
252 | * This method only returns offsets along code point boundaries. | |
253 | * | |
254 | * <p> | |
255 | * This method will perform case folding if case folding was enabled in the constructor. | |
256 | * | |
257 | * <p> | |
258 | * IMPORTANT: The given UnicodeString must not be empty! It is the caller's responsibility to check. | |
259 | */ | |
260 | int32_t getCommonPrefixLength(const UnicodeString& other); | |
261 | ||
262 | /** | |
263 | * Like {@link #getCommonPrefixLength}, but never performs case folding, even if case folding is | |
264 | * enabled for the parser. | |
265 | */ | |
266 | int32_t getCaseSensitivePrefixLength(const UnicodeString& other); | |
267 | ||
268 | bool operator==(const UnicodeString& other) const; | |
269 | ||
270 | private: | |
3d1f044b | 271 | const UnicodeString& fStr; |
0f5d89e8 A |
272 | int32_t fStart; |
273 | int32_t fEnd; | |
274 | bool fFoldCase; | |
275 | ||
276 | int32_t getPrefixLengthInternal(const UnicodeString& other, bool foldCase); | |
277 | ||
278 | static bool codePointsEqual(UChar32 cp1, UChar32 cp2, bool foldCase); | |
279 | }; | |
280 | ||
281 | ||
282 | /** | |
283 | * The core interface implemented by all matchers used for number parsing. | |
284 | * | |
285 | * Given a string, there should NOT be more than one way to consume the string with the same matcher | |
286 | * applied multiple times. If there is, the non-greedy parsing algorithm will be unhappy and may enter an | |
287 | * exponential-time loop. For example, consider the "A Matcher" that accepts "any number of As". Given | |
288 | * the string "AAAA", there are 2^N = 8 ways to apply the A Matcher to this string: you could have the A | |
289 | * Matcher apply 4 times to each character; you could have it apply just once to all the characters; you | |
290 | * could have it apply to the first 2 characters and the second 2 characters; and so on. A better version | |
291 | * of the "A Matcher" would be for it to accept exactly one A, and allow the algorithm to run it | |
292 | * repeatedly to consume a string of multiple As. The A Matcher can implement the Flexible interface | |
293 | * below to signal that it can be applied multiple times in a row. | |
294 | * | |
295 | * @author sffc | |
296 | */ | |
297 | // Exported as U_I18N_API for tests | |
298 | class U_I18N_API NumberParseMatcher { | |
299 | public: | |
300 | virtual ~NumberParseMatcher(); | |
301 | ||
302 | /** | |
303 | * Matchers can override this method to return true to indicate that they are optional and can be run | |
304 | * repeatedly. Used by SeriesMatcher, primarily in the context of IgnorablesMatcher. | |
305 | */ | |
306 | virtual bool isFlexible() const { | |
307 | return false; | |
308 | } | |
309 | ||
310 | /** | |
311 | * Runs this matcher starting at the beginning of the given StringSegment. If this matcher finds | |
312 | * something interesting in the StringSegment, it should update the offset of the StringSegment | |
313 | * corresponding to how many chars were matched. | |
314 | * | |
315 | * This method is thread-safe. | |
316 | * | |
317 | * @param segment | |
318 | * The StringSegment to match against. Matches always start at the beginning of the | |
319 | * segment. The segment is guaranteed to contain at least one char. | |
320 | * @param result | |
321 | * The data structure to store results if the match succeeds. | |
322 | * @return Whether this matcher thinks there may be more interesting chars beyond the end of the | |
323 | * string segment. | |
324 | */ | |
325 | virtual bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const = 0; | |
326 | ||
327 | /** | |
328 | * Performs a fast "smoke check" for whether or not this matcher could possibly match against the | |
329 | * given string segment. The test should be as fast as possible but also as restrictive as possible. | |
330 | * For example, matchers can maintain a UnicodeSet of all code points that count possibly start a | |
331 | * match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly | |
332 | * handle case folding. | |
333 | * | |
334 | * @param segment | |
335 | * The segment to check against. | |
336 | * @return true if the matcher might be able to match against this segment; false if it definitely | |
337 | * will not be able to match. | |
338 | */ | |
339 | virtual bool smokeTest(const StringSegment& segment) const = 0; | |
340 | ||
341 | /** | |
342 | * Method called at the end of a parse, after all matchers have failed to consume any more chars. | |
343 | * Allows a matcher to make final modifications to the result given the knowledge that no more | |
344 | * matches are possible. | |
345 | * | |
346 | * @param result | |
347 | * The data structure to store results. | |
348 | */ | |
349 | virtual void postProcess(ParsedNumber&) const { | |
350 | // Default implementation: no-op | |
3d1f044b | 351 | } |
0f5d89e8 A |
352 | |
353 | // String for debugging | |
354 | virtual UnicodeString toString() const = 0; | |
355 | ||
356 | protected: | |
357 | // No construction except by subclasses! | |
358 | NumberParseMatcher() = default; | |
359 | }; | |
360 | ||
361 | ||
362 | /** | |
363 | * Interface for use in arguments. | |
364 | */ | |
365 | // Exported as U_I18N_API for tests | |
366 | class U_I18N_API MutableMatcherCollection { | |
367 | public: | |
368 | virtual ~MutableMatcherCollection() = default; | |
369 | ||
370 | virtual void addMatcher(NumberParseMatcher& matcher) = 0; | |
371 | }; | |
372 | ||
373 | ||
374 | } // namespace impl | |
375 | } // namespace numparse | |
376 | U_NAMESPACE_END | |
377 | ||
378 | #endif //__NUMPARSE_TYPES_H__ | |
379 | #endif /* #if !UCONFIG_NO_FORMATTING */ |