]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
4388f060 A |
3 | /* |
4 | ******************************************************************************* | |
5 | * Copyright (C) 2010-2012, International Business Machines | |
6 | * Corporation and others. All Rights Reserved. | |
7 | ******************************************************************************* | |
8 | * file name: ucharstrie.h | |
f3c0d7a5 | 9 | * encoding: UTF-8 |
4388f060 A |
10 | * tab size: 8 (not used) |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2010nov14 | |
14 | * created by: Markus W. Scherer | |
15 | */ | |
16 | ||
17 | #ifndef __UCHARSTRIE_H__ | |
18 | #define __UCHARSTRIE_H__ | |
19 | ||
20 | /** | |
21 | * \file | |
22 | * \brief C++ API: Trie for mapping Unicode strings (or 16-bit-unit sequences) | |
23 | * to integer values. | |
24 | */ | |
25 | ||
26 | #include "unicode/utypes.h" | |
27 | #include "unicode/unistr.h" | |
28 | #include "unicode/uobject.h" | |
29 | #include "unicode/ustringtrie.h" | |
30 | ||
f3c0d7a5 | 31 | #if U_SHOW_CPLUSPLUS_API |
4388f060 A |
32 | U_NAMESPACE_BEGIN |
33 | ||
34 | class Appendable; | |
35 | class UCharsTrieBuilder; | |
36 | class UVector32; | |
37 | ||
38 | /** | |
39 | * Light-weight, non-const reader class for a UCharsTrie. | |
f3c0d7a5 | 40 | * Traverses a char16_t-serialized data structure with minimal state, |
4388f060 A |
41 | * for mapping strings (16-bit-unit sequences) to non-negative integer values. |
42 | * | |
43 | * This class owns the serialized trie data only if it was constructed by | |
44 | * the builder's build() method. | |
45 | * The public constructor and the copy constructor only alias the data (only copy the pointer). | |
46 | * There is no assignment operator. | |
47 | * | |
48 | * This class is not intended for public subclassing. | |
49 | * @stable ICU 4.8 | |
50 | */ | |
51 | class U_COMMON_API UCharsTrie : public UMemory { | |
52 | public: | |
53 | /** | |
54 | * Constructs a UCharsTrie reader instance. | |
55 | * | |
f3c0d7a5 A |
56 | * The trieUChars must contain a copy of a char16_t sequence from the UCharsTrieBuilder, |
57 | * starting with the first char16_t of that sequence. | |
58 | * The UCharsTrie object will not read more char16_ts than | |
4388f060 A |
59 | * the UCharsTrieBuilder generated in the corresponding build() call. |
60 | * | |
61 | * The array is not copied/cloned and must not be modified while | |
62 | * the UCharsTrie object is in use. | |
63 | * | |
f3c0d7a5 | 64 | * @param trieUChars The char16_t array that contains the serialized trie. |
4388f060 A |
65 | * @stable ICU 4.8 |
66 | */ | |
f3c0d7a5 | 67 | UCharsTrie(ConstChar16Ptr trieUChars) |
4388f060 A |
68 | : ownedArray_(NULL), uchars_(trieUChars), |
69 | pos_(uchars_), remainingMatchLength_(-1) {} | |
70 | ||
71 | /** | |
72 | * Destructor. | |
73 | * @stable ICU 4.8 | |
74 | */ | |
75 | ~UCharsTrie(); | |
76 | ||
77 | /** | |
78 | * Copy constructor, copies the other trie reader object and its state, | |
f3c0d7a5 | 79 | * but not the char16_t array which will be shared. (Shallow copy.) |
4388f060 A |
80 | * @param other Another UCharsTrie object. |
81 | * @stable ICU 4.8 | |
82 | */ | |
83 | UCharsTrie(const UCharsTrie &other) | |
84 | : ownedArray_(NULL), uchars_(other.uchars_), | |
85 | pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} | |
86 | ||
87 | /** | |
88 | * Resets this trie to its initial state. | |
89 | * @return *this | |
90 | * @stable ICU 4.8 | |
91 | */ | |
92 | UCharsTrie &reset() { | |
93 | pos_=uchars_; | |
94 | remainingMatchLength_=-1; | |
95 | return *this; | |
96 | } | |
97 | ||
98 | /** | |
99 | * UCharsTrie state object, for saving a trie's current state | |
100 | * and resetting the trie back to this state later. | |
101 | * @stable ICU 4.8 | |
102 | */ | |
103 | class State : public UMemory { | |
104 | public: | |
105 | /** | |
106 | * Constructs an empty State. | |
107 | * @stable ICU 4.8 | |
108 | */ | |
109 | State() { uchars=NULL; } | |
110 | private: | |
111 | friend class UCharsTrie; | |
112 | ||
f3c0d7a5 A |
113 | const char16_t *uchars; |
114 | const char16_t *pos; | |
4388f060 A |
115 | int32_t remainingMatchLength; |
116 | }; | |
117 | ||
118 | /** | |
119 | * Saves the state of this trie. | |
120 | * @param state The State object to hold the trie's state. | |
121 | * @return *this | |
122 | * @see resetToState | |
123 | * @stable ICU 4.8 | |
124 | */ | |
125 | const UCharsTrie &saveState(State &state) const { | |
126 | state.uchars=uchars_; | |
127 | state.pos=pos_; | |
128 | state.remainingMatchLength=remainingMatchLength_; | |
129 | return *this; | |
130 | } | |
131 | ||
132 | /** | |
133 | * Resets this trie to the saved state. | |
134 | * If the state object contains no state, or the state of a different trie, | |
135 | * then this trie remains unchanged. | |
136 | * @param state The State object which holds a saved trie state. | |
137 | * @return *this | |
138 | * @see saveState | |
139 | * @see reset | |
140 | * @stable ICU 4.8 | |
141 | */ | |
142 | UCharsTrie &resetToState(const State &state) { | |
143 | if(uchars_==state.uchars && uchars_!=NULL) { | |
144 | pos_=state.pos; | |
145 | remainingMatchLength_=state.remainingMatchLength; | |
146 | } | |
147 | return *this; | |
148 | } | |
149 | ||
150 | /** | |
151 | * Determines whether the string so far matches, whether it has a value, | |
f3c0d7a5 | 152 | * and whether another input char16_t can continue a matching string. |
4388f060 A |
153 | * @return The match/value Result. |
154 | * @stable ICU 4.8 | |
155 | */ | |
156 | UStringTrieResult current() const; | |
157 | ||
158 | /** | |
f3c0d7a5 | 159 | * Traverses the trie from the initial state for this input char16_t. |
4388f060 A |
160 | * Equivalent to reset().next(uchar). |
161 | * @param uchar Input char value. Values below 0 and above 0xffff will never match. | |
162 | * @return The match/value Result. | |
163 | * @stable ICU 4.8 | |
164 | */ | |
165 | inline UStringTrieResult first(int32_t uchar) { | |
166 | remainingMatchLength_=-1; | |
167 | return nextImpl(uchars_, uchar); | |
168 | } | |
169 | ||
170 | /** | |
171 | * Traverses the trie from the initial state for the | |
172 | * one or two UTF-16 code units for this input code point. | |
173 | * Equivalent to reset().nextForCodePoint(cp). | |
174 | * @param cp A Unicode code point 0..0x10ffff. | |
175 | * @return The match/value Result. | |
176 | * @stable ICU 4.8 | |
177 | */ | |
178 | UStringTrieResult firstForCodePoint(UChar32 cp); | |
179 | ||
180 | /** | |
f3c0d7a5 | 181 | * Traverses the trie from the current state for this input char16_t. |
4388f060 A |
182 | * @param uchar Input char value. Values below 0 and above 0xffff will never match. |
183 | * @return The match/value Result. | |
184 | * @stable ICU 4.8 | |
185 | */ | |
186 | UStringTrieResult next(int32_t uchar); | |
187 | ||
188 | /** | |
189 | * Traverses the trie from the current state for the | |
190 | * one or two UTF-16 code units for this input code point. | |
191 | * @param cp A Unicode code point 0..0x10ffff. | |
192 | * @return The match/value Result. | |
193 | * @stable ICU 4.8 | |
194 | */ | |
195 | UStringTrieResult nextForCodePoint(UChar32 cp); | |
196 | ||
197 | /** | |
198 | * Traverses the trie from the current state for this string. | |
199 | * Equivalent to | |
200 | * \code | |
201 | * Result result=current(); | |
202 | * for(each c in s) | |
203 | * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; | |
204 | * result=next(c); | |
205 | * return result; | |
206 | * \endcode | |
207 | * @param s A string. Can be NULL if length is 0. | |
208 | * @param length The length of the string. Can be -1 if NUL-terminated. | |
209 | * @return The match/value Result. | |
210 | * @stable ICU 4.8 | |
211 | */ | |
f3c0d7a5 | 212 | UStringTrieResult next(ConstChar16Ptr s, int32_t length); |
4388f060 A |
213 | |
214 | /** | |
215 | * Returns a matching string's value if called immediately after | |
216 | * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. | |
217 | * getValue() can be called multiple times. | |
218 | * | |
219 | * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! | |
220 | * @return The value for the string so far. | |
221 | * @stable ICU 4.8 | |
222 | */ | |
223 | inline int32_t getValue() const { | |
f3c0d7a5 | 224 | const char16_t *pos=pos_; |
4388f060 A |
225 | int32_t leadUnit=*pos++; |
226 | // U_ASSERT(leadUnit>=kMinValueLead); | |
227 | return leadUnit&kValueIsFinal ? | |
228 | readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit); | |
229 | } | |
230 | ||
231 | /** | |
232 | * Determines whether all strings reachable from the current state | |
233 | * map to the same value. | |
234 | * @param uniqueValue Receives the unique value, if this function returns TRUE. | |
235 | * (output-only) | |
236 | * @return TRUE if all strings reachable from the current state | |
237 | * map to the same value. | |
238 | * @stable ICU 4.8 | |
239 | */ | |
240 | inline UBool hasUniqueValue(int32_t &uniqueValue) const { | |
f3c0d7a5 | 241 | const char16_t *pos=pos_; |
4388f060 A |
242 | // Skip the rest of a pending linear-match node. |
243 | return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); | |
244 | } | |
245 | ||
246 | /** | |
f3c0d7a5 A |
247 | * Finds each char16_t which continues the string from the current state. |
248 | * That is, each char16_t c for which it would be next(c)!=USTRINGTRIE_NO_MATCH now. | |
249 | * @param out Each next char16_t is appended to this object. | |
250 | * @return the number of char16_ts which continue the string from here | |
4388f060 A |
251 | * @stable ICU 4.8 |
252 | */ | |
253 | int32_t getNextUChars(Appendable &out) const; | |
254 | ||
255 | /** | |
256 | * Iterator for all of the (string, value) pairs in a UCharsTrie. | |
257 | * @stable ICU 4.8 | |
258 | */ | |
259 | class U_COMMON_API Iterator : public UMemory { | |
260 | public: | |
261 | /** | |
f3c0d7a5 A |
262 | * Iterates from the root of a char16_t-serialized UCharsTrie. |
263 | * @param trieUChars The trie char16_ts. | |
4388f060 A |
264 | * @param maxStringLength If 0, the iterator returns full strings. |
265 | * Otherwise, the iterator returns strings with this maximum length. | |
266 | * @param errorCode Standard ICU error code. Its input value must | |
267 | * pass the U_SUCCESS() test, or else the function returns | |
268 | * immediately. Check for U_FAILURE() on output or use with | |
269 | * function chaining. (See User Guide for details.) | |
270 | * @stable ICU 4.8 | |
271 | */ | |
f3c0d7a5 | 272 | Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode); |
4388f060 A |
273 | |
274 | /** | |
275 | * Iterates from the current state of the specified UCharsTrie. | |
276 | * @param trie The trie whose state will be copied for iteration. | |
277 | * @param maxStringLength If 0, the iterator returns full strings. | |
278 | * Otherwise, the iterator returns strings with this maximum length. | |
279 | * @param errorCode Standard ICU error code. Its input value must | |
280 | * pass the U_SUCCESS() test, or else the function returns | |
281 | * immediately. Check for U_FAILURE() on output or use with | |
282 | * function chaining. (See User Guide for details.) | |
283 | * @stable ICU 4.8 | |
284 | */ | |
285 | Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); | |
286 | ||
287 | /** | |
288 | * Destructor. | |
289 | * @stable ICU 4.8 | |
290 | */ | |
291 | ~Iterator(); | |
292 | ||
293 | /** | |
294 | * Resets this iterator to its initial state. | |
295 | * @return *this | |
296 | * @stable ICU 4.8 | |
297 | */ | |
298 | Iterator &reset(); | |
299 | ||
300 | /** | |
301 | * @return TRUE if there are more elements. | |
302 | * @stable ICU 4.8 | |
303 | */ | |
304 | UBool hasNext() const; | |
305 | ||
306 | /** | |
307 | * Finds the next (string, value) pair if there is one. | |
308 | * | |
309 | * If the string is truncated to the maximum length and does not | |
310 | * have a real value, then the value is set to -1. | |
311 | * In this case, this "not a real value" is indistinguishable from | |
312 | * a real value of -1. | |
313 | * @param errorCode Standard ICU error code. Its input value must | |
314 | * pass the U_SUCCESS() test, or else the function returns | |
315 | * immediately. Check for U_FAILURE() on output or use with | |
316 | * function chaining. (See User Guide for details.) | |
317 | * @return TRUE if there is another element. | |
318 | * @stable ICU 4.8 | |
319 | */ | |
320 | UBool next(UErrorCode &errorCode); | |
321 | ||
322 | /** | |
323 | * @return The string for the last successful next(). | |
324 | * @stable ICU 4.8 | |
325 | */ | |
326 | const UnicodeString &getString() const { return str_; } | |
327 | /** | |
328 | * @return The value for the last successful next(). | |
329 | * @stable ICU 4.8 | |
330 | */ | |
331 | int32_t getValue() const { return value_; } | |
332 | ||
333 | private: | |
334 | UBool truncateAndStop() { | |
335 | pos_=NULL; | |
336 | value_=-1; // no real value for str | |
337 | return TRUE; | |
338 | } | |
339 | ||
f3c0d7a5 | 340 | const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode); |
4388f060 | 341 | |
f3c0d7a5 A |
342 | const char16_t *uchars_; |
343 | const char16_t *pos_; | |
344 | const char16_t *initialPos_; | |
4388f060 A |
345 | int32_t remainingMatchLength_; |
346 | int32_t initialRemainingMatchLength_; | |
347 | UBool skipValue_; // Skip intermediate value which was already delivered. | |
348 | ||
349 | UnicodeString str_; | |
350 | int32_t maxLength_; | |
351 | int32_t value_; | |
352 | ||
353 | // The stack stores pairs of integers for backtracking to another | |
354 | // outbound edge of a branch node. | |
355 | // The first integer is an offset from uchars_. | |
356 | // The second integer has the str_.length() from before the node in bits 15..0, | |
357 | // and the remaining branch length in bits 31..16. | |
358 | // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit, | |
359 | // but the code looks more confusing that way.) | |
360 | UVector32 *stack_; | |
361 | }; | |
362 | ||
363 | private: | |
364 | friend class UCharsTrieBuilder; | |
365 | ||
366 | /** | |
367 | * Constructs a UCharsTrie reader instance. | |
368 | * Unlike the public constructor which just aliases an array, | |
369 | * this constructor adopts the builder's array. | |
370 | * This constructor is only called by the builder. | |
371 | */ | |
f3c0d7a5 | 372 | UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars) |
4388f060 A |
373 | : ownedArray_(adoptUChars), uchars_(trieUChars), |
374 | pos_(uchars_), remainingMatchLength_(-1) {} | |
375 | ||
376 | // No assignment operator. | |
377 | UCharsTrie &operator=(const UCharsTrie &other); | |
378 | ||
379 | inline void stop() { | |
380 | pos_=NULL; | |
381 | } | |
382 | ||
383 | // Reads a compact 32-bit integer. | |
384 | // pos is already after the leadUnit, and the lead unit has bit 15 reset. | |
f3c0d7a5 | 385 | static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) { |
4388f060 A |
386 | int32_t value; |
387 | if(leadUnit<kMinTwoUnitValueLead) { | |
388 | value=leadUnit; | |
389 | } else if(leadUnit<kThreeUnitValueLead) { | |
390 | value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos; | |
391 | } else { | |
392 | value=(pos[0]<<16)|pos[1]; | |
393 | } | |
394 | return value; | |
395 | } | |
f3c0d7a5 | 396 | static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) { |
4388f060 A |
397 | if(leadUnit>=kMinTwoUnitValueLead) { |
398 | if(leadUnit<kThreeUnitValueLead) { | |
399 | ++pos; | |
400 | } else { | |
401 | pos+=2; | |
402 | } | |
403 | } | |
404 | return pos; | |
405 | } | |
f3c0d7a5 | 406 | static inline const char16_t *skipValue(const char16_t *pos) { |
4388f060 A |
407 | int32_t leadUnit=*pos++; |
408 | return skipValue(pos, leadUnit&0x7fff); | |
409 | } | |
410 | ||
f3c0d7a5 | 411 | static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) { |
4388f060 A |
412 | // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); |
413 | int32_t value; | |
414 | if(leadUnit<kMinTwoUnitNodeValueLead) { | |
415 | value=(leadUnit>>6)-1; | |
416 | } else if(leadUnit<kThreeUnitNodeValueLead) { | |
417 | value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos; | |
418 | } else { | |
419 | value=(pos[0]<<16)|pos[1]; | |
420 | } | |
421 | return value; | |
422 | } | |
f3c0d7a5 | 423 | static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) { |
4388f060 A |
424 | // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); |
425 | if(leadUnit>=kMinTwoUnitNodeValueLead) { | |
426 | if(leadUnit<kThreeUnitNodeValueLead) { | |
427 | ++pos; | |
428 | } else { | |
429 | pos+=2; | |
430 | } | |
431 | } | |
432 | return pos; | |
433 | } | |
434 | ||
f3c0d7a5 | 435 | static inline const char16_t *jumpByDelta(const char16_t *pos) { |
4388f060 A |
436 | int32_t delta=*pos++; |
437 | if(delta>=kMinTwoUnitDeltaLead) { | |
438 | if(delta==kThreeUnitDeltaLead) { | |
439 | delta=(pos[0]<<16)|pos[1]; | |
440 | pos+=2; | |
441 | } else { | |
442 | delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++; | |
443 | } | |
444 | } | |
445 | return pos+delta; | |
446 | } | |
447 | ||
f3c0d7a5 | 448 | static const char16_t *skipDelta(const char16_t *pos) { |
4388f060 A |
449 | int32_t delta=*pos++; |
450 | if(delta>=kMinTwoUnitDeltaLead) { | |
451 | if(delta==kThreeUnitDeltaLead) { | |
452 | pos+=2; | |
453 | } else { | |
454 | ++pos; | |
455 | } | |
456 | } | |
457 | return pos; | |
458 | } | |
459 | ||
460 | static inline UStringTrieResult valueResult(int32_t node) { | |
461 | return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node>>15)); | |
462 | } | |
463 | ||
464 | // Handles a branch node for both next(uchar) and next(string). | |
f3c0d7a5 | 465 | UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar); |
4388f060 A |
466 | |
467 | // Requires remainingLength_<0. | |
f3c0d7a5 | 468 | UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar); |
4388f060 A |
469 | |
470 | // Helper functions for hasUniqueValue(). | |
471 | // Recursively finds a unique value (or whether there is not a unique one) | |
472 | // from a branch. | |
f3c0d7a5 | 473 | static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length, |
4388f060 A |
474 | UBool haveUniqueValue, int32_t &uniqueValue); |
475 | // Recursively finds a unique value (or whether there is not a unique one) | |
476 | // starting from a position on a node lead unit. | |
f3c0d7a5 | 477 | static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); |
4388f060 A |
478 | |
479 | // Helper functions for getNextUChars(). | |
480 | // getNextUChars() when pos is on a branch node. | |
f3c0d7a5 | 481 | static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out); |
4388f060 A |
482 | |
483 | // UCharsTrie data structure | |
484 | // | |
f3c0d7a5 A |
485 | // The trie consists of a series of char16_t-serialized nodes for incremental |
486 | // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer) | |
4388f060 A |
487 | // The root node is at the beginning of the trie data. |
488 | // | |
489 | // Types of nodes are distinguished by their node lead unit ranges. | |
490 | // After each node, except a final-value node, another node follows to | |
491 | // encode match values or continue matching further units. | |
492 | // | |
493 | // Node types: | |
494 | // - Final-value node: Stores a 32-bit integer in a compact, variable-length format. | |
f3c0d7a5 | 495 | // The value is for the string/char16_t sequence so far. |
4388f060 | 496 | // - Match node, optionally with an intermediate value in a different compact format. |
f3c0d7a5 | 497 | // The value, if present, is for the string/char16_t sequence so far. |
4388f060 A |
498 | // |
499 | // Aside from the value, which uses the node lead unit's high bits: | |
500 | // | |
501 | // - Linear-match node: Matches a number of units. | |
502 | // - Branch node: Branches to other nodes according to the current input unit. | |
503 | // The node unit is the length of the branch (number of units to select from) | |
504 | // minus 1. It is followed by a sub-node: | |
505 | // - If the length is at most kMaxBranchLinearSubNodeLength, then | |
506 | // there are length-1 (key, value) pairs and then one more comparison unit. | |
507 | // If one of the key units matches, then the value is either a final value for | |
508 | // the string so far, or a "jump" delta to the next node. | |
509 | // If the last unit matches, then matching continues with the next node. | |
510 | // (Values have the same encoding as final-value nodes.) | |
511 | // - If the length is greater than kMaxBranchLinearSubNodeLength, then | |
512 | // there is one unit and one "jump" delta. | |
513 | // If the input unit is less than the sub-node unit, then "jump" by delta to | |
514 | // the next sub-node which will have a length of length/2. | |
515 | // (The delta has its own compact encoding.) | |
516 | // Otherwise, skip the "jump" delta to the next sub-node | |
517 | // which will have a length of length-length/2. | |
518 | ||
519 | // Match-node lead unit values, after masking off intermediate-value bits: | |
520 | ||
521 | // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise | |
522 | // the length is one more than the next unit. | |
523 | ||
524 | // For a branch sub-node with at most this many entries, we drop down | |
525 | // to a linear search. | |
526 | static const int32_t kMaxBranchLinearSubNodeLength=5; | |
527 | ||
528 | // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. | |
529 | static const int32_t kMinLinearMatch=0x30; | |
530 | static const int32_t kMaxLinearMatchLength=0x10; | |
531 | ||
532 | // Match-node lead unit bits 14..6 for the optional intermediate value. | |
533 | // If these bits are 0, then there is no intermediate value. | |
534 | // Otherwise, see the *NodeValue* constants below. | |
535 | static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040 | |
536 | static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f | |
537 | ||
538 | // A final-value node has bit 15 set. | |
539 | static const int32_t kValueIsFinal=0x8000; | |
540 | ||
541 | // Compact value: After testing and masking off bit 15, use the following thresholds. | |
542 | static const int32_t kMaxOneUnitValue=0x3fff; | |
543 | ||
544 | static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000 | |
545 | static const int32_t kThreeUnitValueLead=0x7fff; | |
546 | ||
547 | static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff | |
548 | ||
549 | // Compact intermediate-value integer, lead unit shared with a branch or linear-match node. | |
550 | static const int32_t kMaxOneUnitNodeValue=0xff; | |
551 | static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040 | |
552 | static const int32_t kThreeUnitNodeValueLead=0x7fc0; | |
553 | ||
554 | static const int32_t kMaxTwoUnitNodeValue= | |
555 | ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff | |
556 | ||
557 | // Compact delta integers. | |
558 | static const int32_t kMaxOneUnitDelta=0xfbff; | |
559 | static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00 | |
560 | static const int32_t kThreeUnitDeltaLead=0xffff; | |
561 | ||
562 | static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff | |
563 | ||
f3c0d7a5 | 564 | char16_t *ownedArray_; |
4388f060 A |
565 | |
566 | // Fixed value referencing the UCharsTrie words. | |
f3c0d7a5 | 567 | const char16_t *uchars_; |
4388f060 A |
568 | |
569 | // Iterator variables. | |
570 | ||
571 | // Pointer to next trie unit to read. NULL if no more matches. | |
f3c0d7a5 | 572 | const char16_t *pos_; |
4388f060 A |
573 | // Remaining length of a linear-match node, minus 1. Negative if not in such a node. |
574 | int32_t remainingMatchLength_; | |
575 | }; | |
576 | ||
577 | U_NAMESPACE_END | |
f3c0d7a5 | 578 | #endif // U_SHOW_CPLUSPLUS_API |
4388f060 A |
579 | |
580 | #endif // __UCHARSTRIE_H__ |