]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | // © 2018 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | #include "unicode/utypes.h" | |
5 | ||
6 | #if !UCONFIG_NO_FORMATTING | |
7 | #ifndef __NUMPARSE_TYPES_H__ | |
8 | #define __NUMPARSE_TYPES_H__ | |
9 | ||
10 | #include "unicode/uobject.h" | |
11 | #include "number_decimalquantity.h" | |
12 | ||
13 | U_NAMESPACE_BEGIN namespace numparse { | |
14 | namespace impl { | |
15 | ||
// Forward declarations of classes that are defined further down in this header.
class StringSegment;
class ParsedNumber;

// Integer carriers for bit masks: result_flags_t holds ResultFlags values and
// parse_flags_t holds ParseFlags values.
using result_flags_t = int32_t;
using parse_flags_t = int32_t;
22 | ||
/**
 * Bit flags stored in a result_flags_t. Each enumerator occupies a distinct bit so that
 * several of them can be combined with bitwise OR.
 */
enum ResultFlags {
    FLAG_NEGATIVE = 1 << 0,
    FLAG_PERCENT = 1 << 1,
    FLAG_PERMILLE = 1 << 2,
    FLAG_HAS_EXPONENT = 1 << 3,
    // Bit 4 (0x0010) was FLAG_HAS_DEFAULT_CURRENCY; no longer used.
    FLAG_HAS_DECIMAL_SEPARATOR = 1 << 5,
    FLAG_NAN = 1 << 6,
    FLAG_INFINITY = 1 << 7,
    FLAG_FAIL = 1 << 8,
};
35 | ||
/**
 * Bit flags stored in a parse_flags_t. Each enumerator occupies a distinct bit so that
 * several of them can be combined with bitwise OR.
 */
enum ParseFlags {
    PARSE_FLAG_IGNORE_CASE = 1 << 0,
    PARSE_FLAG_MONETARY_SEPARATORS = 1 << 1,
    PARSE_FLAG_STRICT_SEPARATORS = 1 << 2,
    PARSE_FLAG_STRICT_GROUPING_SIZE = 1 << 3,
    PARSE_FLAG_INTEGER_ONLY = 1 << 4,
    PARSE_FLAG_GROUPING_DISABLED = 1 << 5,
    // Bit 6 (0x0040) was PARSE_FLAG_FRACTION_GROUPING_ENABLED; see #10794.
    PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES = 1 << 7,
    PARSE_FLAG_USE_FULL_AFFIXES = 1 << 8,
    PARSE_FLAG_EXACT_AFFIX = 1 << 9,
    PARSE_FLAG_PLUS_SIGN_ALLOWED = 1 << 10,
    // Bit 11 (0x0800) was PARSE_FLAG_OPTIMIZE; no longer used.
    // Bit 12 (0x1000) is PARSE_FLAG_FORCE_BIG_DECIMAL; not used in ICU4C.
    PARSE_FLAG_NO_FOREIGN_CURRENCY = 1 << 13,
};
53 | ||
54 | ||
55 | // TODO: Is this class worthwhile? | |
56 | template<int32_t stackCapacity> | |
57 | class CompactUnicodeString { | |
58 | public: | |
59 | CompactUnicodeString() { | |
60 | static_assert(stackCapacity > 0, "cannot have zero space on stack"); | |
61 | fBuffer[0] = 0; | |
62 | } | |
63 | ||
64 | CompactUnicodeString(const UnicodeString& text) | |
65 | : fBuffer(text.length() + 1) { | |
66 | memcpy(fBuffer.getAlias(), text.getBuffer(), sizeof(UChar) * text.length()); | |
67 | fBuffer[text.length()] = 0; | |
68 | } | |
69 | ||
70 | inline UnicodeString toAliasedUnicodeString() const { | |
71 | return UnicodeString(TRUE, fBuffer.getAlias(), -1); | |
72 | } | |
73 | ||
74 | bool operator==(const CompactUnicodeString& other) const { | |
75 | // Use the alias-only constructor and then call UnicodeString operator== | |
76 | return toAliasedUnicodeString() == other.toAliasedUnicodeString(); | |
77 | } | |
78 | ||
79 | private: | |
80 | MaybeStackArray<UChar, stackCapacity> fBuffer; | |
81 | }; | |
82 | ||
83 | ||
84 | /** | |
85 | * Struct-like class to hold the results of a parsing routine. | |
86 | * | |
87 | * @author sffc | |
88 | */ | |
89 | // Exported as U_I18N_API for tests | |
90 | class U_I18N_API ParsedNumber { | |
91 | public: | |
92 | ||
93 | /** | |
94 | * The numerical value that was parsed. | |
95 | */ | |
96 | ::icu::number::impl::DecimalQuantity quantity; | |
97 | ||
98 | /** | |
99 | * The index of the last char consumed during parsing. If parsing started at index 0, this is equal | |
100 | * to the number of chars consumed. This is NOT necessarily the same as the StringSegment offset; | |
101 | * "weak" chars, like whitespace, change the offset, but the charsConsumed is not touched until a | |
102 | * "strong" char is encountered. | |
103 | */ | |
104 | int32_t charEnd; | |
105 | ||
106 | /** | |
107 | * Boolean flags (see constants above). | |
108 | */ | |
109 | result_flags_t flags; | |
110 | ||
111 | /** | |
112 | * The pattern string corresponding to the prefix that got consumed. | |
113 | */ | |
114 | UnicodeString prefix; | |
115 | ||
116 | /** | |
117 | * The pattern string corresponding to the suffix that got consumed. | |
118 | */ | |
119 | UnicodeString suffix; | |
120 | ||
121 | /** | |
122 | * The currency that got consumed. | |
123 | */ | |
124 | UChar currencyCode[4]; | |
125 | ||
126 | ParsedNumber(); | |
127 | ||
128 | ParsedNumber(const ParsedNumber& other) = default; | |
129 | ||
130 | ParsedNumber& operator=(const ParsedNumber& other) = default; | |
131 | ||
132 | void clear(); | |
133 | ||
134 | /** | |
135 | * Call this method to register that a "strong" char was consumed. This should be done after calling | |
136 | * {@link StringSegment#setOffset} or {@link StringSegment#adjustOffset} except when the char is | |
137 | * "weak", like whitespace. | |
138 | * | |
139 | * <p> | |
140 | * <strong>What is a strong versus weak char?</strong> The behavior of number parsing is to "stop" | |
141 | * after reading the number, even if there is other content following the number. For example, after | |
142 | * parsing the string "123 " (123 followed by a space), the cursor should be set to 3, not 4, even | |
143 | * though there are matchers that accept whitespace. In this example, the digits are strong, whereas | |
144 | * the whitespace is weak. Grouping separators are weak, whereas decimal separators are strong. Most | |
145 | * other chars are strong. | |
146 | * | |
147 | * @param segment | |
148 | * The current StringSegment, usually immediately following a call to setOffset. | |
149 | */ | |
150 | void setCharsConsumed(const StringSegment& segment); | |
151 | ||
152 | /** Apply certain number-related flags to the DecimalQuantity. */ | |
153 | void postProcess(); | |
154 | ||
155 | /** | |
156 | * Returns whether this the parse was successful. To be successful, at least one char must have been | |
157 | * consumed, and the failure flag must not be set. | |
158 | */ | |
159 | bool success() const; | |
160 | ||
161 | bool seenNumber() const; | |
162 | ||
163 | double getDouble() const; | |
164 | ||
165 | void populateFormattable(Formattable& output, parse_flags_t parseFlags) const; | |
166 | ||
167 | bool isBetterThan(const ParsedNumber& other); | |
168 | }; | |
169 | ||
170 | ||
171 | /** | |
172 | * A mutable class allowing for a String with a variable offset and length. The charAt, length, and | |
173 | * subSequence methods all operate relative to the fixed offset into the String. | |
174 | * | |
175 | * @author sffc | |
176 | */ | |
177 | // Exported as U_I18N_API for tests | |
178 | class U_I18N_API StringSegment : public UMemory { | |
179 | public: | |
180 | StringSegment(const UnicodeString& str, bool ignoreCase); | |
181 | ||
182 | int32_t getOffset() const; | |
183 | ||
184 | void setOffset(int32_t start); | |
185 | ||
186 | /** | |
187 | * Equivalent to <code>setOffset(getOffset()+delta)</code>. | |
188 | * | |
189 | * <p> | |
190 | * This method is usually called by a Matcher to register that a char was consumed. If the char is | |
191 | * strong (it usually is, except for things like whitespace), follow this with a call to | |
192 | * {@link ParsedNumber#setCharsConsumed}. For more information on strong chars, see that method. | |
193 | */ | |
194 | void adjustOffset(int32_t delta); | |
195 | ||
196 | /** | |
197 | * Adjusts the offset by the width of the current code point, either 1 or 2 chars. | |
198 | */ | |
199 | void adjustOffsetByCodePoint(); | |
200 | ||
201 | void setLength(int32_t length); | |
202 | ||
203 | void resetLength(); | |
204 | ||
205 | int32_t length() const; | |
206 | ||
207 | char16_t charAt(int32_t index) const; | |
208 | ||
209 | UChar32 codePointAt(int32_t index) const; | |
210 | ||
211 | UnicodeString toUnicodeString() const; | |
212 | ||
213 | const UnicodeString toTempUnicodeString() const; | |
214 | ||
215 | /** | |
216 | * Returns the first code point in the string segment, or -1 if the string starts with an invalid | |
217 | * code point. | |
218 | * | |
219 | * <p> | |
220 | * <strong>Important:</strong> Most of the time, you should use {@link #matches}, which handles case | |
221 | * folding logic, instead of this method. | |
222 | */ | |
223 | UChar32 getCodePoint() const; | |
224 | ||
225 | /** | |
226 | * Returns true if the first code point of this StringSegment equals the given code point. | |
227 | * | |
228 | * <p> | |
229 | * This method will perform case folding if case folding is enabled for the parser. | |
230 | */ | |
231 | bool startsWith(UChar32 otherCp) const; | |
232 | ||
233 | /** | |
234 | * Returns true if the first code point of this StringSegment is in the given UnicodeSet. | |
235 | */ | |
236 | bool startsWith(const UnicodeSet& uniset) const; | |
237 | ||
238 | /** | |
239 | * Returns true if there is at least one code point of overlap between this StringSegment and the | |
240 | * given UnicodeString. | |
241 | */ | |
242 | bool startsWith(const UnicodeString& other) const; | |
243 | ||
244 | /** | |
245 | * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For | |
246 | * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2, | |
247 | * since the first 2 characters are the same. | |
248 | * | |
249 | * <p> | |
250 | * This method only returns offsets along code point boundaries. | |
251 | * | |
252 | * <p> | |
253 | * This method will perform case folding if case folding was enabled in the constructor. | |
254 | * | |
255 | * <p> | |
256 | * IMPORTANT: The given UnicodeString must not be empty! It is the caller's responsibility to check. | |
257 | */ | |
258 | int32_t getCommonPrefixLength(const UnicodeString& other); | |
259 | ||
260 | /** | |
261 | * Like {@link #getCommonPrefixLength}, but never performs case folding, even if case folding is | |
262 | * enabled for the parser. | |
263 | */ | |
264 | int32_t getCaseSensitivePrefixLength(const UnicodeString& other); | |
265 | ||
266 | bool operator==(const UnicodeString& other) const; | |
267 | ||
268 | private: | |
269 | const UnicodeString fStr; | |
270 | int32_t fStart; | |
271 | int32_t fEnd; | |
272 | bool fFoldCase; | |
273 | ||
274 | int32_t getPrefixLengthInternal(const UnicodeString& other, bool foldCase); | |
275 | ||
276 | static bool codePointsEqual(UChar32 cp1, UChar32 cp2, bool foldCase); | |
277 | }; | |
278 | ||
279 | ||
280 | /** | |
281 | * The core interface implemented by all matchers used for number parsing. | |
282 | * | |
283 | * Given a string, there should NOT be more than one way to consume the string with the same matcher | |
284 | * applied multiple times. If there is, the non-greedy parsing algorithm will be unhappy and may enter an | |
285 | * exponential-time loop. For example, consider the "A Matcher" that accepts "any number of As". Given | |
286 | * the string "AAAA", there are 2^N = 8 ways to apply the A Matcher to this string: you could have the A | |
287 | * Matcher apply 4 times to each character; you could have it apply just once to all the characters; you | |
288 | * could have it apply to the first 2 characters and the second 2 characters; and so on. A better version | |
289 | * of the "A Matcher" would be for it to accept exactly one A, and allow the algorithm to run it | |
290 | * repeatedly to consume a string of multiple As. The A Matcher can implement the Flexible interface | |
291 | * below to signal that it can be applied multiple times in a row. | |
292 | * | |
293 | * @author sffc | |
294 | */ | |
295 | // Exported as U_I18N_API for tests | |
296 | class U_I18N_API NumberParseMatcher { | |
297 | public: | |
298 | virtual ~NumberParseMatcher(); | |
299 | ||
300 | /** | |
301 | * Matchers can override this method to return true to indicate that they are optional and can be run | |
302 | * repeatedly. Used by SeriesMatcher, primarily in the context of IgnorablesMatcher. | |
303 | */ | |
304 | virtual bool isFlexible() const { | |
305 | return false; | |
306 | } | |
307 | ||
308 | /** | |
309 | * Runs this matcher starting at the beginning of the given StringSegment. If this matcher finds | |
310 | * something interesting in the StringSegment, it should update the offset of the StringSegment | |
311 | * corresponding to how many chars were matched. | |
312 | * | |
313 | * This method is thread-safe. | |
314 | * | |
315 | * @param segment | |
316 | * The StringSegment to match against. Matches always start at the beginning of the | |
317 | * segment. The segment is guaranteed to contain at least one char. | |
318 | * @param result | |
319 | * The data structure to store results if the match succeeds. | |
320 | * @return Whether this matcher thinks there may be more interesting chars beyond the end of the | |
321 | * string segment. | |
322 | */ | |
323 | virtual bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const = 0; | |
324 | ||
325 | /** | |
326 | * Performs a fast "smoke check" for whether or not this matcher could possibly match against the | |
327 | * given string segment. The test should be as fast as possible but also as restrictive as possible. | |
328 | * For example, matchers can maintain a UnicodeSet of all code points that count possibly start a | |
329 | * match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly | |
330 | * handle case folding. | |
331 | * | |
332 | * @param segment | |
333 | * The segment to check against. | |
334 | * @return true if the matcher might be able to match against this segment; false if it definitely | |
335 | * will not be able to match. | |
336 | */ | |
337 | virtual bool smokeTest(const StringSegment& segment) const = 0; | |
338 | ||
339 | /** | |
340 | * Method called at the end of a parse, after all matchers have failed to consume any more chars. | |
341 | * Allows a matcher to make final modifications to the result given the knowledge that no more | |
342 | * matches are possible. | |
343 | * | |
344 | * @param result | |
345 | * The data structure to store results. | |
346 | */ | |
347 | virtual void postProcess(ParsedNumber&) const { | |
348 | // Default implementation: no-op | |
349 | }; | |
350 | ||
351 | // String for debugging | |
352 | virtual UnicodeString toString() const = 0; | |
353 | ||
354 | protected: | |
355 | // No construction except by subclasses! | |
356 | NumberParseMatcher() = default; | |
357 | }; | |
358 | ||
359 | ||
360 | /** | |
361 | * Interface for use in arguments. | |
362 | */ | |
363 | // Exported as U_I18N_API for tests | |
364 | class U_I18N_API MutableMatcherCollection { | |
365 | public: | |
366 | virtual ~MutableMatcherCollection() = default; | |
367 | ||
368 | virtual void addMatcher(NumberParseMatcher& matcher) = 0; | |
369 | }; | |
370 | ||
371 | ||
372 | } // namespace impl | |
373 | } // namespace numparse | |
374 | U_NAMESPACE_END | |
375 | ||
376 | #endif //__NUMPARSE_TYPES_H__ | |
377 | #endif /* #if !UCONFIG_NO_FORMATTING */ |