]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2010-2011, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ******************************************************************************* | |
6 | * file name: ucharstrie.h | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2010nov14 | |
12 | * created by: Markus W. Scherer | |
13 | */ | |
14 | ||
15 | #include "unicode/utypes.h" | |
16 | #include "unicode/appendable.h" | |
17 | #include "unicode/ucharstrie.h" | |
18 | #include "unicode/uobject.h" | |
19 | #include "unicode/utf16.h" | |
20 | #include "cmemory.h" | |
21 | #include "uassert.h" | |
22 | ||
23 | U_NAMESPACE_BEGIN | |
24 | ||
25 | UCharsTrie::~UCharsTrie() { | |
26 | uprv_free(ownedArray_); | |
27 | } | |
28 | ||
29 | UStringTrieResult | |
30 | UCharsTrie::current() const { | |
31 | const UChar *pos=pos_; | |
32 | if(pos==NULL) { | |
33 | return USTRINGTRIE_NO_MATCH; | |
34 | } else { | |
35 | int32_t node; | |
36 | return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? | |
37 | valueResult(node) : USTRINGTRIE_NO_VALUE; | |
38 | } | |
39 | } | |
40 | ||
41 | UStringTrieResult | |
42 | UCharsTrie::firstForCodePoint(UChar32 cp) { | |
43 | return cp<=0xffff ? | |
44 | first(cp) : | |
45 | (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ? | |
46 | next(U16_TRAIL(cp)) : | |
47 | USTRINGTRIE_NO_MATCH); | |
48 | } | |
49 | ||
50 | UStringTrieResult | |
51 | UCharsTrie::nextForCodePoint(UChar32 cp) { | |
52 | return cp<=0xffff ? | |
53 | next(cp) : | |
54 | (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ? | |
55 | next(U16_TRAIL(cp)) : | |
56 | USTRINGTRIE_NO_MATCH); | |
57 | } | |
58 | ||
59 | UStringTrieResult | |
60 | UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) { | |
61 | // Branch according to the current unit. | |
62 | if(length==0) { | |
63 | length=*pos++; | |
64 | } | |
65 | ++length; | |
66 | // The length of the branch is the number of units to select from. | |
67 | // The data structure encodes a binary search. | |
68 | while(length>kMaxBranchLinearSubNodeLength) { | |
69 | if(uchar<*pos++) { | |
70 | length>>=1; | |
71 | pos=jumpByDelta(pos); | |
72 | } else { | |
73 | length=length-(length>>1); | |
74 | pos=skipDelta(pos); | |
75 | } | |
76 | } | |
77 | // Drop down to linear search for the last few units. | |
78 | // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 | |
79 | // and divides length by 2. | |
80 | do { | |
81 | if(uchar==*pos++) { | |
82 | UStringTrieResult result; | |
83 | int32_t node=*pos; | |
84 | if(node&kValueIsFinal) { | |
85 | // Leave the final value for getValue() to read. | |
86 | result=USTRINGTRIE_FINAL_VALUE; | |
87 | } else { | |
88 | // Use the non-final value as the jump delta. | |
89 | ++pos; | |
90 | // int32_t delta=readValue(pos, node); | |
91 | int32_t delta; | |
92 | if(node<kMinTwoUnitValueLead) { | |
93 | delta=node; | |
94 | } else if(node<kThreeUnitValueLead) { | |
95 | delta=((node-kMinTwoUnitValueLead)<<16)|*pos++; | |
96 | } else { | |
97 | delta=(pos[0]<<16)|pos[1]; | |
98 | pos+=2; | |
99 | } | |
100 | // end readValue() | |
101 | pos+=delta; | |
102 | node=*pos; | |
103 | result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; | |
104 | } | |
105 | pos_=pos; | |
106 | return result; | |
107 | } | |
108 | --length; | |
109 | pos=skipValue(pos); | |
110 | } while(length>1); | |
111 | if(uchar==*pos++) { | |
112 | pos_=pos; | |
113 | int32_t node=*pos; | |
114 | return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; | |
115 | } else { | |
116 | stop(); | |
117 | return USTRINGTRIE_NO_MATCH; | |
118 | } | |
119 | } | |
120 | ||
121 | UStringTrieResult | |
122 | UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) { | |
123 | int32_t node=*pos++; | |
124 | for(;;) { | |
125 | if(node<kMinLinearMatch) { | |
126 | return branchNext(pos, node, uchar); | |
127 | } else if(node<kMinValueLead) { | |
128 | // Match the first of length+1 units. | |
129 | int32_t length=node-kMinLinearMatch; // Actual match length minus 1. | |
130 | if(uchar==*pos++) { | |
131 | remainingMatchLength_=--length; | |
132 | pos_=pos; | |
133 | return (length<0 && (node=*pos)>=kMinValueLead) ? | |
134 | valueResult(node) : USTRINGTRIE_NO_VALUE; | |
135 | } else { | |
136 | // No match. | |
137 | break; | |
138 | } | |
139 | } else if(node&kValueIsFinal) { | |
140 | // No further matching units. | |
141 | break; | |
142 | } else { | |
143 | // Skip intermediate value. | |
144 | pos=skipNodeValue(pos, node); | |
145 | node&=kNodeTypeMask; | |
146 | } | |
147 | } | |
148 | stop(); | |
149 | return USTRINGTRIE_NO_MATCH; | |
150 | } | |
151 | ||
152 | UStringTrieResult | |
153 | UCharsTrie::next(int32_t uchar) { | |
154 | const UChar *pos=pos_; | |
155 | if(pos==NULL) { | |
156 | return USTRINGTRIE_NO_MATCH; | |
157 | } | |
158 | int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. | |
159 | if(length>=0) { | |
160 | // Remaining part of a linear-match node. | |
161 | if(uchar==*pos++) { | |
162 | remainingMatchLength_=--length; | |
163 | pos_=pos; | |
164 | int32_t node; | |
165 | return (length<0 && (node=*pos)>=kMinValueLead) ? | |
166 | valueResult(node) : USTRINGTRIE_NO_VALUE; | |
167 | } else { | |
168 | stop(); | |
169 | return USTRINGTRIE_NO_MATCH; | |
170 | } | |
171 | } | |
172 | return nextImpl(pos, uchar); | |
173 | } | |
174 | ||
175 | UStringTrieResult | |
176 | UCharsTrie::next(const UChar *s, int32_t sLength) { | |
177 | if(sLength<0 ? *s==0 : sLength==0) { | |
178 | // Empty input. | |
179 | return current(); | |
180 | } | |
181 | const UChar *pos=pos_; | |
182 | if(pos==NULL) { | |
183 | return USTRINGTRIE_NO_MATCH; | |
184 | } | |
185 | int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. | |
186 | for(;;) { | |
187 | // Fetch the next input unit, if there is one. | |
188 | // Continue a linear-match node without rechecking sLength<0. | |
189 | int32_t uchar; | |
190 | if(sLength<0) { | |
191 | for(;;) { | |
192 | if((uchar=*s++)==0) { | |
193 | remainingMatchLength_=length; | |
194 | pos_=pos; | |
195 | int32_t node; | |
196 | return (length<0 && (node=*pos)>=kMinValueLead) ? | |
197 | valueResult(node) : USTRINGTRIE_NO_VALUE; | |
198 | } | |
199 | if(length<0) { | |
200 | remainingMatchLength_=length; | |
201 | break; | |
202 | } | |
203 | if(uchar!=*pos) { | |
204 | stop(); | |
205 | return USTRINGTRIE_NO_MATCH; | |
206 | } | |
207 | ++pos; | |
208 | --length; | |
209 | } | |
210 | } else { | |
211 | for(;;) { | |
212 | if(sLength==0) { | |
213 | remainingMatchLength_=length; | |
214 | pos_=pos; | |
215 | int32_t node; | |
216 | return (length<0 && (node=*pos)>=kMinValueLead) ? | |
217 | valueResult(node) : USTRINGTRIE_NO_VALUE; | |
218 | } | |
219 | uchar=*s++; | |
220 | --sLength; | |
221 | if(length<0) { | |
222 | remainingMatchLength_=length; | |
223 | break; | |
224 | } | |
225 | if(uchar!=*pos) { | |
226 | stop(); | |
227 | return USTRINGTRIE_NO_MATCH; | |
228 | } | |
229 | ++pos; | |
230 | --length; | |
231 | } | |
232 | } | |
233 | int32_t node=*pos++; | |
234 | for(;;) { | |
235 | if(node<kMinLinearMatch) { | |
236 | UStringTrieResult result=branchNext(pos, node, uchar); | |
237 | if(result==USTRINGTRIE_NO_MATCH) { | |
238 | return USTRINGTRIE_NO_MATCH; | |
239 | } | |
240 | // Fetch the next input unit, if there is one. | |
241 | if(sLength<0) { | |
242 | if((uchar=*s++)==0) { | |
243 | return result; | |
244 | } | |
245 | } else { | |
246 | if(sLength==0) { | |
247 | return result; | |
248 | } | |
249 | uchar=*s++; | |
250 | --sLength; | |
251 | } | |
252 | if(result==USTRINGTRIE_FINAL_VALUE) { | |
253 | // No further matching units. | |
254 | stop(); | |
255 | return USTRINGTRIE_NO_MATCH; | |
256 | } | |
257 | pos=pos_; // branchNext() advanced pos and wrote it to pos_ . | |
258 | node=*pos++; | |
259 | } else if(node<kMinValueLead) { | |
260 | // Match length+1 units. | |
261 | length=node-kMinLinearMatch; // Actual match length minus 1. | |
262 | if(uchar!=*pos) { | |
263 | stop(); | |
264 | return USTRINGTRIE_NO_MATCH; | |
265 | } | |
266 | ++pos; | |
267 | --length; | |
268 | break; | |
269 | } else if(node&kValueIsFinal) { | |
270 | // No further matching units. | |
271 | stop(); | |
272 | return USTRINGTRIE_NO_MATCH; | |
273 | } else { | |
274 | // Skip intermediate value. | |
275 | pos=skipNodeValue(pos, node); | |
276 | node&=kNodeTypeMask; | |
277 | } | |
278 | } | |
279 | } | |
280 | } | |
281 | ||
282 | const UChar * | |
283 | UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length, | |
284 | UBool haveUniqueValue, int32_t &uniqueValue) { | |
285 | while(length>kMaxBranchLinearSubNodeLength) { | |
286 | ++pos; // ignore the comparison unit | |
287 | if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { | |
288 | return NULL; | |
289 | } | |
290 | length=length-(length>>1); | |
291 | pos=skipDelta(pos); | |
292 | } | |
293 | do { | |
294 | ++pos; // ignore a comparison unit | |
295 | // handle its value | |
296 | int32_t node=*pos++; | |
297 | UBool isFinal=(UBool)(node>>15); | |
298 | node&=0x7fff; | |
299 | int32_t value=readValue(pos, node); | |
300 | pos=skipValue(pos, node); | |
301 | if(isFinal) { | |
302 | if(haveUniqueValue) { | |
303 | if(value!=uniqueValue) { | |
304 | return NULL; | |
305 | } | |
306 | } else { | |
307 | uniqueValue=value; | |
308 | haveUniqueValue=TRUE; | |
309 | } | |
310 | } else { | |
311 | if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { | |
312 | return NULL; | |
313 | } | |
314 | haveUniqueValue=TRUE; | |
315 | } | |
316 | } while(--length>1); | |
317 | return pos+1; // ignore the last comparison unit | |
318 | } | |
319 | ||
320 | UBool | |
321 | UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) { | |
322 | int32_t node=*pos++; | |
323 | for(;;) { | |
324 | if(node<kMinLinearMatch) { | |
325 | if(node==0) { | |
326 | node=*pos++; | |
327 | } | |
328 | pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); | |
329 | if(pos==NULL) { | |
330 | return FALSE; | |
331 | } | |
332 | haveUniqueValue=TRUE; | |
333 | node=*pos++; | |
334 | } else if(node<kMinValueLead) { | |
335 | // linear-match node | |
336 | pos+=node-kMinLinearMatch+1; // Ignore the match units. | |
337 | node=*pos++; | |
338 | } else { | |
339 | UBool isFinal=(UBool)(node>>15); | |
340 | int32_t value; | |
341 | if(isFinal) { | |
342 | value=readValue(pos, node&0x7fff); | |
343 | } else { | |
344 | value=readNodeValue(pos, node); | |
345 | } | |
346 | if(haveUniqueValue) { | |
347 | if(value!=uniqueValue) { | |
348 | return FALSE; | |
349 | } | |
350 | } else { | |
351 | uniqueValue=value; | |
352 | haveUniqueValue=TRUE; | |
353 | } | |
354 | if(isFinal) { | |
355 | return TRUE; | |
356 | } | |
357 | pos=skipNodeValue(pos, node); | |
358 | node&=kNodeTypeMask; | |
359 | } | |
360 | } | |
361 | } | |
362 | ||
363 | int32_t | |
364 | UCharsTrie::getNextUChars(Appendable &out) const { | |
365 | const UChar *pos=pos_; | |
366 | if(pos==NULL) { | |
367 | return 0; | |
368 | } | |
369 | if(remainingMatchLength_>=0) { | |
370 | out.appendCodeUnit(*pos); // Next unit of a pending linear-match node. | |
371 | return 1; | |
372 | } | |
373 | int32_t node=*pos++; | |
374 | if(node>=kMinValueLead) { | |
375 | if(node&kValueIsFinal) { | |
376 | return 0; | |
377 | } else { | |
378 | pos=skipNodeValue(pos, node); | |
379 | node&=kNodeTypeMask; | |
380 | } | |
381 | } | |
382 | if(node<kMinLinearMatch) { | |
383 | if(node==0) { | |
384 | node=*pos++; | |
385 | } | |
386 | out.reserveAppendCapacity(++node); | |
387 | getNextBranchUChars(pos, node, out); | |
388 | return node; | |
389 | } else { | |
390 | // First unit of the linear-match node. | |
391 | out.appendCodeUnit(*pos); | |
392 | return 1; | |
393 | } | |
394 | } | |
395 | ||
396 | void | |
397 | UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) { | |
398 | while(length>kMaxBranchLinearSubNodeLength) { | |
399 | ++pos; // ignore the comparison unit | |
400 | getNextBranchUChars(jumpByDelta(pos), length>>1, out); | |
401 | length=length-(length>>1); | |
402 | pos=skipDelta(pos); | |
403 | } | |
404 | do { | |
405 | out.appendCodeUnit(*pos++); | |
406 | pos=skipValue(pos); | |
407 | } while(--length>1); | |
408 | out.appendCodeUnit(*pos); | |
409 | } | |
410 | ||
411 | U_NAMESPACE_END |