]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2012-2014, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ******************************************************************************* | |
6 | * collationdatabuilder.cpp | |
7 | * | |
8 | * (replaced the former ucol_elm.cpp) | |
9 | * | |
10 | * created on: 2012apr01 | |
11 | * created by: Markus W. Scherer | |
12 | */ | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | ||
16 | #if !UCONFIG_NO_COLLATION | |
17 | ||
18 | #include "unicode/localpointer.h" | |
19 | #include "unicode/uchar.h" | |
20 | #include "unicode/ucharstrie.h" | |
21 | #include "unicode/ucharstriebuilder.h" | |
22 | #include "unicode/uniset.h" | |
23 | #include "unicode/unistr.h" | |
24 | #include "unicode/usetiter.h" | |
25 | #include "unicode/utf16.h" | |
26 | #include "cmemory.h" | |
27 | #include "collation.h" | |
28 | #include "collationdata.h" | |
29 | #include "collationdatabuilder.h" | |
30 | #include "collationfastlatinbuilder.h" | |
31 | #include "collationiterator.h" | |
32 | #include "normalizer2impl.h" | |
33 | #include "utrie2.h" | |
34 | #include "uvectr32.h" | |
35 | #include "uvectr64.h" | |
36 | #include "uvector.h" | |
37 | ||
38 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | |
39 | ||
40 | U_NAMESPACE_BEGIN | |
41 | ||
42 | CollationDataBuilder::CEModifier::~CEModifier() {} | |
43 | ||
44 | /** | |
45 | * Build-time context and CE32 for a code point. | |
46 | * If a code point has contextual mappings, then the default (no-context) mapping | |
47 | * and all conditional mappings are stored in a singly-linked list | |
48 | * of ConditionalCE32, sorted by context strings. | |
49 | * | |
50 | * Context strings sort by prefix length, then by prefix, then by contraction suffix. | |
51 | * Context strings must be unique and in ascending order. | |
52 | */ | |
53 | struct ConditionalCE32 : public UMemory { | |
54 | ConditionalCE32(const UnicodeString &ct, uint32_t ce) | |
55 | : context(ct), | |
56 | ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), | |
57 | next(-1) {} | |
58 | ||
59 | inline UBool hasContext() const { return context.length() > 1; } | |
60 | inline int32_t prefixLength() const { return context.charAt(0); } | |
61 | ||
62 | /** | |
63 | * "\0" for the first entry for any code point, with its default CE32. | |
64 | * | |
65 | * Otherwise one unit with the length of the prefix string, | |
66 | * then the prefix string, then the contraction suffix. | |
67 | */ | |
68 | UnicodeString context; | |
69 | /** | |
70 | * CE32 for the code point and its context. | |
71 | * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). | |
72 | */ | |
73 | uint32_t ce32; | |
74 | /** | |
75 | * Default CE32 for all contexts with this same prefix. | |
76 | * Initially NO_CE32. Set only while building runtime data structures, | |
77 | * and only on one of the nodes of a sub-list with the same prefix. | |
78 | */ | |
79 | uint32_t defaultCE32; | |
80 | /** | |
81 | * CE32 for the built contexts. | |
82 | * When fetching CEs from the builder, the contexts are built into their runtime form | |
83 | * so that the normal collation implementation can process them. | |
84 | * The result is cached in the list head. It is reset when the contexts are modified. | |
85 | */ | |
86 | uint32_t builtCE32; | |
87 | /** | |
88 | * Index of the next ConditionalCE32. | |
89 | * Negative for the end of the list. | |
90 | */ | |
91 | int32_t next; | |
92 | }; | |
93 | ||
94 | U_CDECL_BEGIN | |
95 | ||
96 | U_CAPI void U_CALLCONV | |
97 | uprv_deleteConditionalCE32(void *obj) { | |
98 | delete static_cast<ConditionalCE32 *>(obj); | |
99 | } | |
100 | ||
101 | U_CDECL_END | |
102 | ||
103 | /** | |
104 | * Build-time collation element and character iterator. | |
105 | * Uses the runtime CollationIterator for fetching CEs for a string | |
106 | * but reads from the builder's unfinished data structures. | |
107 | * In particular, this class reads from the unfinished trie | |
108 | * and has to avoid CollationIterator::nextCE() and redirect other | |
109 | * calls to data->getCE32() and data->getCE32FromSupplementary(). | |
110 | * | |
111 | * We do this so that we need not implement the collation algorithm | |
112 | * again for the builder and make it behave exactly like the runtime code. | |
113 | * That would be more difficult to test and maintain than this indirection. | |
114 | * | |
115 | * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, | |
116 | * so the data accesses from those code paths need not be modified. | |
117 | * | |
118 | * This class iterates directly over whole code points | |
119 | * so that the CollationIterator does not need the finished trie | |
120 | * for handling the LEAD_SURROGATE_TAG. | |
121 | */ | |
122 | class DataBuilderCollationIterator : public CollationIterator { | |
123 | public: | |
124 | DataBuilderCollationIterator(CollationDataBuilder &b); | |
125 | ||
126 | virtual ~DataBuilderCollationIterator(); | |
127 | ||
128 | int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength); | |
129 | ||
130 | virtual void resetToOffset(int32_t newOffset); | |
131 | virtual int32_t getOffset() const; | |
132 | ||
133 | virtual UChar32 nextCodePoint(UErrorCode &errorCode); | |
134 | virtual UChar32 previousCodePoint(UErrorCode &errorCode); | |
135 | ||
136 | protected: | |
137 | virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode); | |
138 | virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode); | |
139 | ||
140 | virtual uint32_t getDataCE32(UChar32 c) const; | |
141 | virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode); | |
142 | ||
143 | CollationDataBuilder &builder; | |
144 | CollationData builderData; | |
145 | uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; | |
146 | const UnicodeString *s; | |
147 | int32_t pos; | |
148 | }; | |
149 | ||
150 | DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b) | |
151 | : CollationIterator(&builderData, /*numeric=*/ FALSE), | |
152 | builder(b), builderData(b.nfcImpl), | |
153 | s(NULL), pos(0) { | |
154 | builderData.base = builder.base; | |
155 | // Set all of the jamoCE32s[] to indirection CE32s. | |
156 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. | |
157 | UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j); | |
158 | jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) | | |
159 | CollationDataBuilder::IS_BUILDER_JAMO_CE32; | |
160 | } | |
161 | builderData.jamoCE32s = jamoCE32s; | |
162 | } | |
163 | ||
164 | DataBuilderCollationIterator::~DataBuilderCollationIterator() {} | |
165 | ||
166 | int32_t | |
167 | DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start, | |
168 | int64_t ces[], int32_t cesLength) { | |
169 | // Set the pointers each time, in case they changed due to reallocation. | |
170 | builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer()); | |
171 | builderData.ces = builder.ce64s.getBuffer(); | |
172 | builderData.contexts = builder.contexts.getBuffer(); | |
173 | // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32(). | |
174 | reset(); | |
175 | s = &str; | |
176 | pos = start; | |
177 | UErrorCode errorCode = U_ZERO_ERROR; | |
178 | while(U_SUCCESS(errorCode) && pos < s->length()) { | |
179 | // No need to keep all CEs in the iterator buffer. | |
180 | clearCEs(); | |
181 | UChar32 c = s->char32At(pos); | |
182 | pos += U16_LENGTH(c); | |
183 | uint32_t ce32 = utrie2_get32(builder.trie, c); | |
184 | const CollationData *d; | |
185 | if(ce32 == Collation::FALLBACK_CE32) { | |
186 | d = builder.base; | |
187 | ce32 = builder.base->getCE32(c); | |
188 | } else { | |
189 | d = &builderData; | |
190 | } | |
191 | appendCEsFromCE32(d, c, ce32, /*forward=*/ TRUE, errorCode); | |
192 | U_ASSERT(U_SUCCESS(errorCode)); | |
193 | for(int32_t i = 0; i < getCEsLength(); ++i) { | |
194 | int64_t ce = getCE(i); | |
195 | if(ce != 0) { | |
196 | if(cesLength < Collation::MAX_EXPANSION_LENGTH) { | |
197 | ces[cesLength] = ce; | |
198 | } | |
199 | ++cesLength; | |
200 | } | |
201 | } | |
202 | } | |
203 | return cesLength; | |
204 | } | |
205 | ||
206 | void | |
207 | DataBuilderCollationIterator::resetToOffset(int32_t newOffset) { | |
208 | reset(); | |
209 | pos = newOffset; | |
210 | } | |
211 | ||
212 | int32_t | |
213 | DataBuilderCollationIterator::getOffset() const { | |
214 | return pos; | |
215 | } | |
216 | ||
217 | UChar32 | |
218 | DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { | |
219 | if(pos == s->length()) { | |
220 | return U_SENTINEL; | |
221 | } | |
222 | UChar32 c = s->char32At(pos); | |
223 | pos += U16_LENGTH(c); | |
224 | return c; | |
225 | } | |
226 | ||
227 | UChar32 | |
228 | DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { | |
229 | if(pos == 0) { | |
230 | return U_SENTINEL; | |
231 | } | |
232 | UChar32 c = s->char32At(pos - 1); | |
233 | pos -= U16_LENGTH(c); | |
234 | return c; | |
235 | } | |
236 | ||
237 | void | |
238 | DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { | |
239 | pos = s->moveIndex32(pos, num); | |
240 | } | |
241 | ||
242 | void | |
243 | DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { | |
244 | pos = s->moveIndex32(pos, -num); | |
245 | } | |
246 | ||
247 | uint32_t | |
248 | DataBuilderCollationIterator::getDataCE32(UChar32 c) const { | |
249 | return utrie2_get32(builder.trie, c); | |
250 | } | |
251 | ||
252 | uint32_t | |
253 | DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) { | |
254 | U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)); | |
255 | if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) { | |
256 | UChar32 jamo = Collation::indexFromCE32(ce32); | |
257 | return utrie2_get32(builder.trie, jamo); | |
258 | } else { | |
259 | ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32); | |
260 | if(cond->builtCE32 == Collation::NO_CE32) { | |
261 | // Build the context-sensitive mappings into their runtime form and cache the result. | |
262 | cond->builtCE32 = builder.buildContext(cond, errorCode); | |
263 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { | |
264 | errorCode = U_ZERO_ERROR; | |
265 | builder.clearContexts(); | |
266 | cond->builtCE32 = builder.buildContext(cond, errorCode); | |
267 | } | |
268 | builderData.contexts = builder.contexts.getBuffer(); | |
269 | } | |
270 | return cond->builtCE32; | |
271 | } | |
272 | } | |
273 | ||
274 | // ------------------------------------------------------------------------- *** | |
275 | ||
276 | CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode) | |
277 | : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), | |
278 | base(NULL), baseSettings(NULL), | |
279 | trie(NULL), | |
280 | ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode), | |
281 | modified(FALSE), | |
282 | fastLatinEnabled(FALSE), fastLatinBuilder(NULL), | |
283 | collIter(NULL) { | |
284 | // Reserve the first CE32 for U+0000. | |
285 | ce32s.addElement(0, errorCode); | |
286 | conditionalCE32s.setDeleter(uprv_deleteConditionalCE32); | |
287 | } | |
288 | ||
289 | CollationDataBuilder::~CollationDataBuilder() { | |
290 | utrie2_close(trie); | |
291 | delete fastLatinBuilder; | |
292 | delete collIter; | |
293 | } | |
294 | ||
295 | void | |
296 | CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) { | |
297 | if(U_FAILURE(errorCode)) { return; } | |
298 | if(trie != NULL) { | |
299 | errorCode = U_INVALID_STATE_ERROR; | |
300 | return; | |
301 | } | |
302 | if(b == NULL) { | |
303 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
304 | return; | |
305 | } | |
306 | base = b; | |
307 | ||
308 | // For a tailoring, the default is to fall back to the base. | |
309 | trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode); | |
310 | ||
311 | // Set the Latin-1 letters block so that it is allocated first in the data array, | |
312 | // to try to improve locality of reference when sorting Latin-1 text. | |
313 | // Do not use utrie2_setRange32() since that will not actually allocate blocks | |
314 | // that are filled with the default value. | |
315 | // ASCII (0..7F) is already preallocated anyway. | |
316 | for(UChar32 c = 0xc0; c <= 0xff; ++c) { | |
317 | utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode); | |
318 | } | |
319 | ||
320 | // Hangul syllables are not tailorable (except via tailoring Jamos). | |
321 | // Always set the Hangul tag to help performance. | |
322 | // Do this here, rather than in buildMappings(), | |
323 | // so that we see the HANGUL_TAG in various assertions. | |
324 | uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); | |
325 | utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode); | |
326 | ||
327 | // Copy the set contents but don't copy/clone the set as a whole because | |
328 | // that would copy the isFrozen state too. | |
329 | unsafeBackwardSet.addAll(*b->unsafeBackwardSet); | |
330 | ||
331 | if(U_FAILURE(errorCode)) { return; } | |
332 | } | |
333 | ||
334 | UBool | |
335 | CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end, | |
336 | uint32_t primary, int32_t step, | |
337 | UErrorCode &errorCode) { | |
338 | if(U_FAILURE(errorCode)) { return FALSE; } | |
339 | U_ASSERT(start <= end); | |
340 | // TODO: Do we need to check what values are currently set for start..end? | |
341 | // An offset range is worth it only if we can achieve an overlap between | |
342 | // adjacent UTrie2 blocks of 32 code points each. | |
343 | // An offset CE is also a little more expensive to look up and compute | |
344 | // than a simple CE. | |
345 | // If the range spans at least three UTrie2 block boundaries (> 64 code points), | |
346 | // then we take it. | |
347 | // If the range spans one or two block boundaries and there are | |
348 | // at least 4 code points on either side, then we take it. | |
349 | // (We could additionally require a minimum range length of, say, 16.) | |
350 | int32_t blockDelta = (end >> 5) - (start >> 5); | |
351 | if(2 <= step && step <= 0x7f && | |
352 | (blockDelta >= 3 || | |
353 | (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) { | |
354 | int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step; | |
355 | if(isCompressiblePrimary(primary)) { dataCE |= 0x80; } | |
356 | int32_t index = addCE(dataCE, errorCode); | |
357 | if(U_FAILURE(errorCode)) { return 0; } | |
358 | if(index > Collation::MAX_INDEX) { | |
359 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
360 | return 0; | |
361 | } | |
362 | uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index); | |
363 | utrie2_setRange32(trie, start, end, offsetCE32, TRUE, &errorCode); | |
364 | modified = TRUE; | |
365 | return TRUE; | |
366 | } else { | |
367 | return FALSE; | |
368 | } | |
369 | } | |
370 | ||
371 | uint32_t | |
372 | CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, | |
373 | uint32_t primary, int32_t step, | |
374 | UErrorCode &errorCode) { | |
375 | if(U_FAILURE(errorCode)) { return 0; } | |
376 | UBool isCompressible = isCompressiblePrimary(primary); | |
377 | if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) { | |
378 | return Collation::incThreeBytePrimaryByOffset(primary, isCompressible, | |
379 | (end - start + 1) * step); | |
380 | } else { | |
381 | // Short range: Set individual CE32s. | |
382 | for(;;) { | |
383 | utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode); | |
384 | ++start; | |
385 | primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step); | |
386 | if(start > end) { return primary; } | |
387 | } | |
388 | modified = TRUE; | |
389 | } | |
390 | } | |
391 | ||
392 | uint32_t | |
393 | CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const { | |
394 | int32_t i = Collation::indexFromCE32(ce32); | |
395 | int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i); | |
396 | uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); | |
397 | return Collation::makeLongPrimaryCE32(p); | |
398 | } | |
399 | ||
400 | UBool | |
401 | CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const { | |
402 | return base->isCompressibleLeadByte(b); | |
403 | } | |
404 | ||
405 | UBool | |
406 | CollationDataBuilder::isAssigned(UChar32 c) const { | |
407 | return Collation::isAssignedCE32(utrie2_get32(trie, c)); | |
408 | } | |
409 | ||
410 | uint32_t | |
411 | CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const { | |
412 | uint32_t ce32 = utrie2_get32(trie, c); | |
413 | if(Collation::isLongPrimaryCE32(ce32)) { | |
414 | return Collation::primaryFromLongPrimaryCE32(ce32); | |
415 | } else { | |
416 | return 0; | |
417 | } | |
418 | } | |
419 | ||
420 | int64_t | |
421 | CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const { | |
422 | if(U_FAILURE(errorCode)) { return 0; } | |
423 | UBool fromBase = FALSE; | |
424 | uint32_t ce32 = utrie2_get32(trie, c); | |
425 | if(ce32 == Collation::FALLBACK_CE32) { | |
426 | fromBase = TRUE; | |
427 | ce32 = base->getCE32(c); | |
428 | } | |
429 | while(Collation::isSpecialCE32(ce32)) { | |
430 | switch(Collation::tagFromCE32(ce32)) { | |
431 | case Collation::LATIN_EXPANSION_TAG: | |
432 | case Collation::BUILDER_DATA_TAG: | |
433 | case Collation::PREFIX_TAG: | |
434 | case Collation::CONTRACTION_TAG: | |
435 | case Collation::HANGUL_TAG: | |
436 | case Collation::LEAD_SURROGATE_TAG: | |
437 | errorCode = U_UNSUPPORTED_ERROR; | |
438 | return 0; | |
439 | case Collation::FALLBACK_TAG: | |
440 | case Collation::RESERVED_TAG_3: | |
441 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
442 | return 0; | |
443 | case Collation::LONG_PRIMARY_TAG: | |
444 | return Collation::ceFromLongPrimaryCE32(ce32); | |
445 | case Collation::LONG_SECONDARY_TAG: | |
446 | return Collation::ceFromLongSecondaryCE32(ce32); | |
447 | case Collation::EXPANSION32_TAG: | |
448 | if(Collation::lengthFromCE32(ce32) == 1) { | |
449 | int32_t i = Collation::indexFromCE32(ce32); | |
450 | ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i); | |
451 | break; | |
452 | } else { | |
453 | errorCode = U_UNSUPPORTED_ERROR; | |
454 | return 0; | |
455 | } | |
456 | case Collation::EXPANSION_TAG: { | |
457 | if(Collation::lengthFromCE32(ce32) == 1) { | |
458 | int32_t i = Collation::indexFromCE32(ce32); | |
459 | return fromBase ? base->ces[i] : ce64s.elementAti(i); | |
460 | } else { | |
461 | errorCode = U_UNSUPPORTED_ERROR; | |
462 | return 0; | |
463 | } | |
464 | } | |
465 | case Collation::DIGIT_TAG: | |
466 | // Fetch the non-numeric-collation CE32 and continue. | |
467 | ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32)); | |
468 | break; | |
469 | case Collation::U0000_TAG: | |
470 | U_ASSERT(c == 0); | |
471 | // Fetch the normal ce32 for U+0000 and continue. | |
472 | ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0); | |
473 | break; | |
474 | case Collation::OFFSET_TAG: | |
475 | ce32 = getCE32FromOffsetCE32(fromBase, c, ce32); | |
476 | break; | |
477 | case Collation::IMPLICIT_TAG: | |
478 | return Collation::unassignedCEFromCodePoint(c); | |
479 | } | |
480 | } | |
481 | return Collation::ceFromSimpleCE32(ce32); | |
482 | } | |
483 | ||
484 | int32_t | |
485 | CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) { | |
486 | int32_t length = ce64s.size(); | |
487 | for(int32_t i = 0; i < length; ++i) { | |
488 | if(ce == ce64s.elementAti(i)) { return i; } | |
489 | } | |
490 | ce64s.addElement(ce, errorCode); | |
491 | return length; | |
492 | } | |
493 | ||
494 | int32_t | |
495 | CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) { | |
496 | int32_t length = ce32s.size(); | |
497 | for(int32_t i = 0; i < length; ++i) { | |
498 | if(ce32 == (uint32_t)ce32s.elementAti(i)) { return i; } | |
499 | } | |
500 | ce32s.addElement((int32_t)ce32, errorCode); | |
501 | return length; | |
502 | } | |
503 | ||
504 | int32_t | |
505 | CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32, | |
506 | UErrorCode &errorCode) { | |
507 | if(U_FAILURE(errorCode)) { return -1; } | |
508 | U_ASSERT(!context.isEmpty()); | |
509 | int32_t index = conditionalCE32s.size(); | |
510 | if(index > Collation::MAX_INDEX) { | |
511 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
512 | return -1; | |
513 | } | |
514 | ConditionalCE32 *cond = new ConditionalCE32(context, ce32); | |
515 | if(cond == NULL) { | |
516 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
517 | return -1; | |
518 | } | |
519 | conditionalCE32s.addElement(cond, errorCode); | |
520 | return index; | |
521 | } | |
522 | ||
523 | void | |
524 | CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s, | |
525 | const int64_t ces[], int32_t cesLength, | |
526 | UErrorCode &errorCode) { | |
527 | uint32_t ce32 = encodeCEs(ces, cesLength, errorCode); | |
528 | addCE32(prefix, s, ce32, errorCode); | |
529 | } | |
530 | ||
531 | void | |
532 | CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s, | |
533 | uint32_t ce32, UErrorCode &errorCode) { | |
534 | if(U_FAILURE(errorCode)) { return; } | |
535 | if(s.isEmpty()) { | |
536 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
537 | return; | |
538 | } | |
539 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
540 | errorCode = U_INVALID_STATE_ERROR; | |
541 | return; | |
542 | } | |
543 | UChar32 c = s.char32At(0); | |
544 | int32_t cLength = U16_LENGTH(c); | |
545 | uint32_t oldCE32 = utrie2_get32(trie, c); | |
546 | UBool hasContext = !prefix.isEmpty() || s.length() > cLength; | |
547 | if(oldCE32 == Collation::FALLBACK_CE32) { | |
548 | // First tailoring for c. | |
549 | // If c has contextual base mappings or if we add a contextual mapping, | |
550 | // then copy the base mappings. | |
551 | // Otherwise we just override the base mapping. | |
552 | uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c)); | |
553 | if(hasContext || Collation::ce32HasContext(baseCE32)) { | |
554 | oldCE32 = copyFromBaseCE32(c, baseCE32, TRUE, errorCode); | |
555 | utrie2_set32(trie, c, oldCE32, &errorCode); | |
556 | if(U_FAILURE(errorCode)) { return; } | |
557 | } | |
558 | } | |
559 | if(!hasContext) { | |
560 | // No prefix, no contraction. | |
561 | if(!isBuilderContextCE32(oldCE32)) { | |
562 | utrie2_set32(trie, c, ce32, &errorCode); | |
563 | } else { | |
564 | ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32); | |
565 | cond->builtCE32 = Collation::NO_CE32; | |
566 | cond->ce32 = ce32; | |
567 | } | |
568 | } else { | |
569 | ConditionalCE32 *cond; | |
570 | if(!isBuilderContextCE32(oldCE32)) { | |
571 | // Replace the simple oldCE32 with a builder context CE32 | |
572 | // pointing to a new ConditionalCE32 list head. | |
573 | int32_t index = addConditionalCE32(UnicodeString((UChar)0), oldCE32, errorCode); | |
574 | if(U_FAILURE(errorCode)) { return; } | |
575 | uint32_t contextCE32 = makeBuilderContextCE32(index); | |
576 | utrie2_set32(trie, c, contextCE32, &errorCode); | |
577 | contextChars.add(c); | |
578 | cond = getConditionalCE32(index); | |
579 | } else { | |
580 | cond = getConditionalCE32ForCE32(oldCE32); | |
581 | cond->builtCE32 = Collation::NO_CE32; | |
582 | } | |
583 | UnicodeString suffix(s, cLength); | |
584 | UnicodeString context((UChar)prefix.length()); | |
585 | context.append(prefix).append(suffix); | |
586 | unsafeBackwardSet.addAll(suffix); | |
587 | for(;;) { | |
588 | // invariant: context > cond->context | |
589 | int32_t next = cond->next; | |
590 | if(next < 0) { | |
591 | // Append a new ConditionalCE32 after cond. | |
592 | int32_t index = addConditionalCE32(context, ce32, errorCode); | |
593 | if(U_FAILURE(errorCode)) { return; } | |
594 | cond->next = index; | |
595 | break; | |
596 | } | |
597 | ConditionalCE32 *nextCond = getConditionalCE32(next); | |
598 | int8_t cmp = context.compare(nextCond->context); | |
599 | if(cmp < 0) { | |
600 | // Insert a new ConditionalCE32 between cond and nextCond. | |
601 | int32_t index = addConditionalCE32(context, ce32, errorCode); | |
602 | if(U_FAILURE(errorCode)) { return; } | |
603 | cond->next = index; | |
604 | getConditionalCE32(index)->next = next; | |
605 | break; | |
606 | } else if(cmp == 0) { | |
607 | // Same context as before, overwrite its ce32. | |
608 | nextCond->ce32 = ce32; | |
609 | break; | |
610 | } | |
611 | cond = nextCond; | |
612 | } | |
613 | } | |
614 | modified = TRUE; | |
615 | } | |
616 | ||
617 | uint32_t | |
618 | CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) { | |
619 | uint32_t p = (uint32_t)(ce >> 32); | |
620 | uint32_t lower32 = (uint32_t)ce; | |
621 | uint32_t t = (uint32_t)(ce & 0xffff); | |
622 | U_ASSERT((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. | |
623 | if((ce & INT64_C(0xffff00ff00ff)) == 0) { | |
624 | // normal form ppppsstt | |
625 | return p | (lower32 >> 16) | (t >> 8); | |
626 | } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) { | |
627 | // long-primary form ppppppC1 | |
628 | return Collation::makeLongPrimaryCE32(p); | |
629 | } else if(p == 0 && (t & 0xff) == 0) { | |
630 | // long-secondary form ssssttC2 | |
631 | return Collation::makeLongSecondaryCE32(lower32); | |
632 | } | |
633 | return Collation::NO_CE32; | |
634 | } | |
635 | ||
636 | uint32_t | |
637 | CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) { | |
638 | // Try to encode one CE as one CE32. | |
639 | uint32_t ce32 = encodeOneCEAsCE32(ce); | |
640 | if(ce32 != Collation::NO_CE32) { return ce32; } | |
641 | int32_t index = addCE(ce, errorCode); | |
642 | if(U_FAILURE(errorCode)) { return 0; } | |
643 | if(index > Collation::MAX_INDEX) { | |
644 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
645 | return 0; | |
646 | } | |
647 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1); | |
648 | } | |
649 | ||
650 | uint32_t | |
651 | CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, | |
652 | UErrorCode &errorCode) { | |
653 | if(U_FAILURE(errorCode)) { return 0; } | |
654 | if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) { | |
655 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
656 | return 0; | |
657 | } | |
658 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
659 | errorCode = U_INVALID_STATE_ERROR; | |
660 | return 0; | |
661 | } | |
662 | if(cesLength == 0) { | |
663 | // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. | |
664 | // Do this here so that callers need not do it. | |
665 | return encodeOneCEAsCE32(0); | |
666 | } else if(cesLength == 1) { | |
667 | return encodeOneCE(ces[0], errorCode); | |
668 | } else if(cesLength == 2) { | |
669 | // Try to encode two CEs as one CE32. | |
670 | int64_t ce0 = ces[0]; | |
671 | int64_t ce1 = ces[1]; | |
672 | uint32_t p0 = (uint32_t)(ce0 >> 32); | |
673 | if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE && | |
674 | (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE && | |
675 | p0 != 0) { | |
676 | // Latin mini expansion | |
677 | return | |
678 | p0 | | |
679 | (((uint32_t)ce0 & 0xff00u) << 8) | | |
680 | (uint32_t)(ce1 >> 16) | | |
681 | Collation::SPECIAL_CE32_LOW_BYTE | | |
682 | Collation::LATIN_EXPANSION_TAG; | |
683 | } | |
684 | } | |
685 | // Try to encode two or more CEs as CE32s. | |
686 | int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH]; | |
687 | for(int32_t i = 0;; ++i) { | |
688 | if(i == cesLength) { | |
689 | return encodeExpansion32(newCE32s, cesLength, errorCode); | |
690 | } | |
691 | uint32_t ce32 = encodeOneCEAsCE32(ces[i]); | |
692 | if(ce32 == Collation::NO_CE32) { break; } | |
693 | newCE32s[i] = (int32_t)ce32; | |
694 | } | |
695 | return encodeExpansion(ces, cesLength, errorCode); | |
696 | } | |
697 | ||
698 | uint32_t | |
699 | CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) { | |
700 | if(U_FAILURE(errorCode)) { return 0; } | |
701 | // See if this sequence of CEs has already been stored. | |
702 | int64_t first = ces[0]; | |
703 | int32_t ce64sMax = ce64s.size() - length; | |
704 | for(int32_t i = 0; i <= ce64sMax; ++i) { | |
705 | if(first == ce64s.elementAti(i)) { | |
706 | if(i > Collation::MAX_INDEX) { | |
707 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
708 | return 0; | |
709 | } | |
710 | for(int32_t j = 1;; ++j) { | |
711 | if(j == length) { | |
712 | return Collation::makeCE32FromTagIndexAndLength( | |
713 | Collation::EXPANSION_TAG, i, length); | |
714 | } | |
715 | if(ce64s.elementAti(i + j) != ces[j]) { break; } | |
716 | } | |
717 | } | |
718 | } | |
719 | // Store the new sequence. | |
720 | int32_t i = ce64s.size(); | |
721 | if(i > Collation::MAX_INDEX) { | |
722 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
723 | return 0; | |
724 | } | |
725 | for(int32_t j = 0; j < length; ++j) { | |
726 | ce64s.addElement(ces[j], errorCode); | |
727 | } | |
728 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length); | |
729 | } | |
730 | ||
731 | uint32_t | |
732 | CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length, | |
733 | UErrorCode &errorCode) { | |
734 | if(U_FAILURE(errorCode)) { return 0; } | |
735 | // See if this sequence of CE32s has already been stored. | |
736 | int32_t first = newCE32s[0]; | |
737 | int32_t ce32sMax = ce32s.size() - length; | |
738 | for(int32_t i = 0; i <= ce32sMax; ++i) { | |
739 | if(first == ce32s.elementAti(i)) { | |
740 | if(i > Collation::MAX_INDEX) { | |
741 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
742 | return 0; | |
743 | } | |
744 | for(int32_t j = 1;; ++j) { | |
745 | if(j == length) { | |
746 | return Collation::makeCE32FromTagIndexAndLength( | |
747 | Collation::EXPANSION32_TAG, i, length); | |
748 | } | |
749 | if(ce32s.elementAti(i + j) != newCE32s[j]) { break; } | |
750 | } | |
751 | } | |
752 | } | |
753 | // Store the new sequence. | |
754 | int32_t i = ce32s.size(); | |
755 | if(i > Collation::MAX_INDEX) { | |
756 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
757 | return 0; | |
758 | } | |
759 | for(int32_t j = 0; j < length; ++j) { | |
760 | ce32s.addElement(newCE32s[j], errorCode); | |
761 | } | |
762 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length); | |
763 | } | |
764 | ||
765 | uint32_t | |
766 | CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, | |
767 | UErrorCode &errorCode) { | |
768 | if(U_FAILURE(errorCode)) { return 0; } | |
769 | if(!Collation::isSpecialCE32(ce32)) { return ce32; } | |
770 | switch(Collation::tagFromCE32(ce32)) { | |
771 | case Collation::LONG_PRIMARY_TAG: | |
772 | case Collation::LONG_SECONDARY_TAG: | |
773 | case Collation::LATIN_EXPANSION_TAG: | |
774 | // copy as is | |
775 | break; | |
776 | case Collation::EXPANSION32_TAG: { | |
777 | const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32); | |
778 | int32_t length = Collation::lengthFromCE32(ce32); | |
779 | ce32 = encodeExpansion32( | |
780 | reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode); | |
781 | break; | |
782 | } | |
783 | case Collation::EXPANSION_TAG: { | |
784 | const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32); | |
785 | int32_t length = Collation::lengthFromCE32(ce32); | |
786 | ce32 = encodeExpansion(baseCEs, length, errorCode); | |
787 | break; | |
788 | } | |
789 | case Collation::PREFIX_TAG: { | |
790 | // Flatten prefixes and nested suffixes (contractions) | |
791 | // into a linear list of ConditionalCE32. | |
792 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
793 | ce32 = CollationData::readCE32(p); // Default if no prefix match. | |
794 | if(!withContext) { | |
795 | return copyFromBaseCE32(c, ce32, FALSE, errorCode); | |
796 | } | |
797 | ConditionalCE32 head(UnicodeString(), 0); | |
798 | UnicodeString context((UChar)0); | |
799 | int32_t index; | |
800 | if(Collation::isContractionCE32(ce32)) { | |
801 | index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); | |
802 | } else { | |
803 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
804 | head.next = index = addConditionalCE32(context, ce32, errorCode); | |
805 | } | |
806 | if(U_FAILURE(errorCode)) { return 0; } | |
807 | ConditionalCE32 *cond = getConditionalCE32(index); // the last ConditionalCE32 so far | |
808 | UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); | |
809 | while(prefixes.next(errorCode)) { | |
810 | context = prefixes.getString(); | |
811 | context.reverse(); | |
812 | context.insert(0, (UChar)context.length()); | |
813 | ce32 = (uint32_t)prefixes.getValue(); | |
814 | if(Collation::isContractionCE32(ce32)) { | |
815 | index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode); | |
816 | } else { | |
817 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
818 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
819 | } | |
820 | if(U_FAILURE(errorCode)) { return 0; } | |
821 | cond = getConditionalCE32(index); | |
822 | } | |
823 | ce32 = makeBuilderContextCE32(head.next); | |
824 | contextChars.add(c); | |
825 | break; | |
826 | } | |
827 | case Collation::CONTRACTION_TAG: { | |
828 | if(!withContext) { | |
829 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
830 | ce32 = CollationData::readCE32(p); // Default if no suffix match. | |
831 | return copyFromBaseCE32(c, ce32, FALSE, errorCode); | |
832 | } | |
833 | ConditionalCE32 head(UnicodeString(), 0); | |
834 | UnicodeString context((UChar)0); | |
835 | copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); | |
836 | ce32 = makeBuilderContextCE32(head.next); | |
837 | contextChars.add(c); | |
838 | break; | |
839 | } | |
840 | case Collation::HANGUL_TAG: | |
841 | errorCode = U_UNSUPPORTED_ERROR; // We forbid tailoring of Hangul syllables. | |
842 | break; | |
843 | case Collation::OFFSET_TAG: | |
844 | ce32 = getCE32FromOffsetCE32(TRUE, c, ce32); | |
845 | break; | |
846 | case Collation::IMPLICIT_TAG: | |
847 | ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode); | |
848 | break; | |
849 | default: | |
850 | U_ASSERT(FALSE); // require ce32 == base->getFinalCE32(ce32) | |
851 | break; | |
852 | } | |
853 | return ce32; | |
854 | } | |
855 | ||
856 | int32_t | |
857 | CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, | |
858 | ConditionalCE32 *cond, UErrorCode &errorCode) { | |
859 | if(U_FAILURE(errorCode)) { return 0; } | |
860 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
861 | int32_t index; | |
862 | if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { | |
863 | // No match on the single code point. | |
864 | // We are underneath a prefix, and the default mapping is just | |
865 | // a fallback to the mappings for a shorter prefix. | |
866 | U_ASSERT(context.length() > 1); | |
867 | index = -1; | |
868 | } else { | |
869 | ce32 = CollationData::readCE32(p); // Default if no suffix match. | |
870 | U_ASSERT(!Collation::isContractionCE32(ce32)); | |
871 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
872 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
873 | if(U_FAILURE(errorCode)) { return 0; } | |
874 | cond = getConditionalCE32(index); | |
875 | } | |
876 | ||
877 | int32_t suffixStart = context.length(); | |
878 | UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); | |
879 | while(suffixes.next(errorCode)) { | |
880 | context.append(suffixes.getString()); | |
881 | ce32 = copyFromBaseCE32(c, (uint32_t)suffixes.getValue(), TRUE, errorCode); | |
882 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
883 | if(U_FAILURE(errorCode)) { return 0; } | |
884 | // No need to update the unsafeBackwardSet because the tailoring set | |
885 | // is already a copy of the base set. | |
886 | cond = getConditionalCE32(index); | |
887 | context.truncate(suffixStart); | |
888 | } | |
889 | U_ASSERT(index >= 0); | |
890 | return index; | |
891 | } | |
892 | ||
893 | class CopyHelper { | |
894 | public: | |
895 | CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d, | |
896 | const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode) | |
897 | : src(s), dest(d), modifier(m), | |
898 | errorCode(initialErrorCode) {} | |
899 | ||
900 | UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) { | |
901 | ce32 = copyCE32(ce32); | |
902 | utrie2_setRange32(dest.trie, start, end, ce32, TRUE, &errorCode); | |
903 | if(CollationDataBuilder::isBuilderContextCE32(ce32)) { | |
904 | dest.contextChars.add(start, end); | |
905 | } | |
906 | return U_SUCCESS(errorCode); | |
907 | } | |
908 | ||
909 | uint32_t copyCE32(uint32_t ce32) { | |
910 | if(!Collation::isSpecialCE32(ce32)) { | |
911 | int64_t ce = modifier.modifyCE32(ce32); | |
912 | if(ce != Collation::NO_CE) { | |
913 | ce32 = dest.encodeOneCE(ce, errorCode); | |
914 | } | |
915 | } else { | |
916 | int32_t tag = Collation::tagFromCE32(ce32); | |
917 | if(tag == Collation::EXPANSION32_TAG) { | |
918 | const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer()); | |
919 | srcCE32s += Collation::indexFromCE32(ce32); | |
920 | int32_t length = Collation::lengthFromCE32(ce32); | |
921 | // Inspect the source CE32s. Just copy them if none are modified. | |
922 | // Otherwise copy to modifiedCEs, with modifications. | |
923 | UBool isModified = FALSE; | |
924 | for(int32_t i = 0; i < length; ++i) { | |
925 | ce32 = srcCE32s[i]; | |
926 | int64_t ce; | |
927 | if(Collation::isSpecialCE32(ce32) || | |
928 | (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) { | |
929 | if(isModified) { | |
930 | modifiedCEs[i] = Collation::ceFromCE32(ce32); | |
931 | } | |
932 | } else { | |
933 | if(!isModified) { | |
934 | for(int32_t j = 0; j < i; ++j) { | |
935 | modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]); | |
936 | } | |
937 | isModified = TRUE; | |
938 | } | |
939 | modifiedCEs[i] = ce; | |
940 | } | |
941 | } | |
942 | if(isModified) { | |
943 | ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); | |
944 | } else { | |
945 | ce32 = dest.encodeExpansion32( | |
946 | reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode); | |
947 | } | |
948 | } else if(tag == Collation::EXPANSION_TAG) { | |
949 | const int64_t *srcCEs = src.ce64s.getBuffer(); | |
950 | srcCEs += Collation::indexFromCE32(ce32); | |
951 | int32_t length = Collation::lengthFromCE32(ce32); | |
952 | // Inspect the source CEs. Just copy them if none are modified. | |
953 | // Otherwise copy to modifiedCEs, with modifications. | |
954 | UBool isModified = FALSE; | |
955 | for(int32_t i = 0; i < length; ++i) { | |
956 | int64_t srcCE = srcCEs[i]; | |
957 | int64_t ce = modifier.modifyCE(srcCE); | |
958 | if(ce == Collation::NO_CE) { | |
959 | if(isModified) { | |
960 | modifiedCEs[i] = srcCE; | |
961 | } | |
962 | } else { | |
963 | if(!isModified) { | |
964 | for(int32_t j = 0; j < i; ++j) { | |
965 | modifiedCEs[j] = srcCEs[j]; | |
966 | } | |
967 | isModified = TRUE; | |
968 | } | |
969 | modifiedCEs[i] = ce; | |
970 | } | |
971 | } | |
972 | if(isModified) { | |
973 | ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); | |
974 | } else { | |
975 | ce32 = dest.encodeExpansion(srcCEs, length, errorCode); | |
976 | } | |
977 | } else if(tag == Collation::BUILDER_DATA_TAG) { | |
978 | // Copy the list of ConditionalCE32. | |
979 | ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32); | |
980 | U_ASSERT(!cond->hasContext()); | |
981 | int32_t destIndex = dest.addConditionalCE32( | |
982 | cond->context, copyCE32(cond->ce32), errorCode); | |
983 | ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex); | |
984 | while(cond->next >= 0) { | |
985 | cond = src.getConditionalCE32(cond->next); | |
986 | ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex); | |
987 | destIndex = dest.addConditionalCE32( | |
988 | cond->context, copyCE32(cond->ce32), errorCode); | |
989 | int32_t suffixStart = cond->prefixLength() + 1; | |
990 | dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart)); | |
991 | prevDestCond->next = destIndex; | |
992 | } | |
993 | } else { | |
994 | // Just copy long CEs and Latin mini expansions (and other expected values) as is, | |
995 | // assuming that the modifier would not modify them. | |
996 | U_ASSERT(tag == Collation::LONG_PRIMARY_TAG || | |
997 | tag == Collation::LONG_SECONDARY_TAG || | |
998 | tag == Collation::LATIN_EXPANSION_TAG || | |
999 | tag == Collation::HANGUL_TAG); | |
1000 | } | |
1001 | } | |
1002 | return ce32; | |
1003 | } | |
1004 | ||
1005 | const CollationDataBuilder &src; | |
1006 | CollationDataBuilder &dest; | |
1007 | const CollationDataBuilder::CEModifier &modifier; | |
1008 | int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH]; | |
1009 | UErrorCode errorCode; | |
1010 | }; | |
1011 | ||
1012 | U_CDECL_BEGIN | |
1013 | ||
1014 | static UBool U_CALLCONV | |
1015 | enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) { | |
1016 | return | |
1017 | value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 || | |
1018 | ((CopyHelper *)context)->copyRangeCE32(start, end, value); | |
1019 | } | |
1020 | ||
1021 | U_CDECL_END | |
1022 | ||
1023 | void | |
1024 | CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, | |
1025 | UErrorCode &errorCode) { | |
1026 | if(U_FAILURE(errorCode)) { return; } | |
1027 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
1028 | errorCode = U_INVALID_STATE_ERROR; | |
1029 | return; | |
1030 | } | |
1031 | CopyHelper helper(src, *this, modifier, errorCode); | |
1032 | utrie2_enum(src.trie, NULL, enumRangeForCopy, &helper); | |
1033 | errorCode = helper.errorCode; | |
1034 | // Update the contextChars and the unsafeBackwardSet while copying, | |
1035 | // in case a character had conditional mappings in the source builder | |
1036 | // and they were removed later. | |
1037 | modified |= src.modified; | |
1038 | } | |
1039 | ||
1040 | void | |
1041 | CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) { | |
1042 | if(U_FAILURE(errorCode) || set.isEmpty()) { return; } | |
1043 | UnicodeSetIterator iter(set); | |
1044 | while(iter.next() && !iter.isString()) { | |
1045 | UChar32 c = iter.getCodepoint(); | |
1046 | uint32_t ce32 = utrie2_get32(trie, c); | |
1047 | if(ce32 == Collation::FALLBACK_CE32) { | |
1048 | ce32 = base->getFinalCE32(base->getCE32(c)); | |
1049 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
1050 | utrie2_set32(trie, c, ce32, &errorCode); | |
1051 | } | |
1052 | } | |
1053 | modified = TRUE; | |
1054 | } | |
1055 | ||
1056 | void | |
1057 | CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) { | |
1058 | if(U_FAILURE(errorCode) || set.isEmpty()) { return; } | |
1059 | UnicodeSetIterator iter(set); | |
1060 | while(iter.next() && !iter.isString()) { | |
1061 | UChar32 c = iter.getCodepoint(); | |
1062 | uint32_t ce32 = utrie2_get32(trie, c); | |
1063 | if(ce32 == Collation::FALLBACK_CE32) { | |
1064 | ce32 = base->getFinalCE32(base->getCE32(c)); | |
1065 | if(Collation::ce32HasContext(ce32)) { | |
1066 | ce32 = copyFromBaseCE32(c, ce32, FALSE /* without context */, errorCode); | |
1067 | utrie2_set32(trie, c, ce32, &errorCode); | |
1068 | } | |
1069 | } else if(isBuilderContextCE32(ce32)) { | |
1070 | ce32 = getConditionalCE32ForCE32(ce32)->ce32; | |
1071 | // Simply abandon the list of ConditionalCE32. | |
1072 | // The caller will copy this builder in the end, | |
1073 | // eliminating unreachable data. | |
1074 | utrie2_set32(trie, c, ce32, &errorCode); | |
1075 | contextChars.remove(c); | |
1076 | } | |
1077 | } | |
1078 | modified = TRUE; | |
1079 | } | |
1080 | ||
1081 | UBool | |
1082 | CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) { | |
1083 | if(U_FAILURE(errorCode)) { return FALSE; } | |
1084 | UBool anyJamoAssigned = base == NULL; // always set jamoCE32s in the base data | |
1085 | UBool needToCopyFromBase = FALSE; | |
1086 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. | |
1087 | UChar32 jamo = jamoCpFromIndex(j); | |
1088 | UBool fromBase = FALSE; | |
1089 | uint32_t ce32 = utrie2_get32(trie, jamo); | |
1090 | anyJamoAssigned |= Collation::isAssignedCE32(ce32); | |
1091 | // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. | |
1092 | // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) | |
1093 | if(ce32 == Collation::FALLBACK_CE32) { | |
1094 | fromBase = TRUE; | |
1095 | ce32 = base->getCE32(jamo); | |
1096 | } | |
1097 | if(Collation::isSpecialCE32(ce32)) { | |
1098 | switch(Collation::tagFromCE32(ce32)) { | |
1099 | case Collation::LONG_PRIMARY_TAG: | |
1100 | case Collation::LONG_SECONDARY_TAG: | |
1101 | case Collation::LATIN_EXPANSION_TAG: | |
1102 | // Copy the ce32 as-is. | |
1103 | break; | |
1104 | case Collation::EXPANSION32_TAG: | |
1105 | case Collation::EXPANSION_TAG: | |
1106 | case Collation::PREFIX_TAG: | |
1107 | case Collation::CONTRACTION_TAG: | |
1108 | if(fromBase) { | |
1109 | // Defer copying until we know if anyJamoAssigned. | |
1110 | ce32 = Collation::FALLBACK_CE32; | |
1111 | needToCopyFromBase = TRUE; | |
1112 | } | |
1113 | break; | |
1114 | case Collation::IMPLICIT_TAG: | |
1115 | // An unassigned Jamo should only occur in tests with incomplete bases. | |
1116 | U_ASSERT(fromBase); | |
1117 | ce32 = Collation::FALLBACK_CE32; | |
1118 | needToCopyFromBase = TRUE; | |
1119 | break; | |
1120 | case Collation::OFFSET_TAG: | |
1121 | ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32); | |
1122 | break; | |
1123 | case Collation::FALLBACK_TAG: | |
1124 | case Collation::RESERVED_TAG_3: | |
1125 | case Collation::BUILDER_DATA_TAG: | |
1126 | case Collation::DIGIT_TAG: | |
1127 | case Collation::U0000_TAG: | |
1128 | case Collation::HANGUL_TAG: | |
1129 | case Collation::LEAD_SURROGATE_TAG: | |
1130 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
1131 | return FALSE; | |
1132 | } | |
1133 | } | |
1134 | jamoCE32s[j] = ce32; | |
1135 | } | |
1136 | if(anyJamoAssigned && needToCopyFromBase) { | |
1137 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { | |
1138 | if(jamoCE32s[j] == Collation::FALLBACK_CE32) { | |
1139 | UChar32 jamo = jamoCpFromIndex(j); | |
1140 | jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo), | |
1141 | /*withContext=*/ TRUE, errorCode); | |
1142 | } | |
1143 | } | |
1144 | } | |
1145 | return anyJamoAssigned && U_SUCCESS(errorCode); | |
1146 | } | |
1147 | ||
1148 | void | |
1149 | CollationDataBuilder::setDigitTags(UErrorCode &errorCode) { | |
1150 | UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode); | |
1151 | if(U_FAILURE(errorCode)) { return; } | |
1152 | UnicodeSetIterator iter(digits); | |
1153 | while(iter.next()) { | |
1154 | U_ASSERT(!iter.isString()); | |
1155 | UChar32 c = iter.getCodepoint(); | |
1156 | uint32_t ce32 = utrie2_get32(trie, c); | |
1157 | if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) { | |
1158 | int32_t index = addCE32(ce32, errorCode); | |
1159 | if(U_FAILURE(errorCode)) { return; } | |
1160 | if(index > Collation::MAX_INDEX) { | |
1161 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1162 | return; | |
1163 | } | |
1164 | ce32 = Collation::makeCE32FromTagIndexAndLength( | |
1165 | Collation::DIGIT_TAG, index, u_charDigitValue(c)); | |
1166 | utrie2_set32(trie, c, ce32, &errorCode); | |
1167 | } | |
1168 | } | |
1169 | } | |
1170 | ||
1171 | U_CDECL_BEGIN | |
1172 | ||
1173 | static UBool U_CALLCONV | |
1174 | enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { | |
1175 | int32_t *pValue = (int32_t *)context; | |
1176 | if(value == Collation::UNASSIGNED_CE32) { | |
1177 | value = Collation::LEAD_ALL_UNASSIGNED; | |
1178 | } else if(value == Collation::FALLBACK_CE32) { | |
1179 | value = Collation::LEAD_ALL_FALLBACK; | |
1180 | } else { | |
1181 | *pValue = Collation::LEAD_MIXED; | |
1182 | return FALSE; | |
1183 | } | |
1184 | if(*pValue < 0) { | |
1185 | *pValue = (int32_t)value; | |
1186 | } else if(*pValue != (int32_t)value) { | |
1187 | *pValue = Collation::LEAD_MIXED; | |
1188 | return FALSE; | |
1189 | } | |
1190 | return TRUE; | |
1191 | } | |
1192 | ||
1193 | U_CDECL_END | |
1194 | ||
1195 | void | |
1196 | CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) { | |
1197 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead) { | |
1198 | int32_t value = -1; | |
1199 | utrie2_enumForLeadSurrogate(trie, lead, NULL, enumRangeLeadValue, &value); | |
1200 | utrie2_set32ForLeadSurrogateCodeUnit( | |
1201 | trie, lead, | |
1202 | Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value, | |
1203 | &errorCode); | |
1204 | } | |
1205 | } | |
1206 | ||
1207 | void | |
1208 | CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) { | |
1209 | buildMappings(data, errorCode); | |
1210 | if(base != NULL) { | |
1211 | data.numericPrimary = base->numericPrimary; | |
1212 | data.compressibleBytes = base->compressibleBytes; | |
1213 | data.scripts = base->scripts; | |
1214 | data.scriptsLength = base->scriptsLength; | |
1215 | } | |
1216 | buildFastLatinTable(data, errorCode); | |
1217 | } | |
1218 | ||
1219 | void | |
1220 | CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) { | |
1221 | if(U_FAILURE(errorCode)) { return; } | |
1222 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
1223 | errorCode = U_INVALID_STATE_ERROR; | |
1224 | return; | |
1225 | } | |
1226 | ||
1227 | buildContexts(errorCode); | |
1228 | ||
1229 | uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; | |
1230 | int32_t jamoIndex = -1; | |
1231 | if(getJamoCE32s(jamoCE32s, errorCode)) { | |
1232 | jamoIndex = ce32s.size(); | |
1233 | for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) { | |
1234 | ce32s.addElement((int32_t)jamoCE32s[i], errorCode); | |
1235 | } | |
1236 | // Small optimization: Use a bit in the Hangul ce32 | |
1237 | // to indicate that none of the Jamo CE32s are isSpecialCE32() | |
1238 | // (as it should be in the root collator). | |
1239 | // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. | |
1240 | // In order to still have good trie compression and keep this code simple, | |
1241 | // we only set this flag if a whole block of 588 Hangul syllables starting with | |
1242 | // a common leading consonant (Jamo L) has this property. | |
1243 | UBool isAnyJamoVTSpecial = FALSE; | |
1244 | for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) { | |
1245 | if(Collation::isSpecialCE32(jamoCE32s[i])) { | |
1246 | isAnyJamoVTSpecial = TRUE; | |
1247 | break; | |
1248 | } | |
1249 | } | |
1250 | uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); | |
1251 | UChar32 c = Hangul::HANGUL_BASE; | |
1252 | for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L | |
1253 | uint32_t ce32 = hangulCE32; | |
1254 | if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) { | |
1255 | ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO; | |
1256 | } | |
1257 | UChar32 limit = c + Hangul::JAMO_VT_COUNT; | |
1258 | utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); | |
1259 | c = limit; | |
1260 | } | |
1261 | } else { | |
1262 | // Copy the Hangul CE32s from the base in blocks per Jamo L, | |
1263 | // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. | |
1264 | for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) { | |
1265 | uint32_t ce32 = base->getCE32(c); | |
1266 | U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG)); | |
1267 | UChar32 limit = c + Hangul::JAMO_VT_COUNT; | |
1268 | utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); | |
1269 | c = limit; | |
1270 | } | |
1271 | } | |
1272 | ||
1273 | setDigitTags(errorCode); | |
1274 | setLeadSurrogates(errorCode); | |
1275 | ||
1276 | // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. | |
1277 | ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0); | |
1278 | utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode); | |
1279 | ||
1280 | utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode); | |
1281 | if(U_FAILURE(errorCode)) { return; } | |
1282 | ||
1283 | // Mark each lead surrogate as "unsafe" | |
1284 | // if any of its 1024 associated supplementary code points is "unsafe". | |
1285 | UChar32 c = 0x10000; | |
1286 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { | |
1287 | if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { | |
1288 | unsafeBackwardSet.add(lead); | |
1289 | } | |
1290 | } | |
1291 | unsafeBackwardSet.freeze(); | |
1292 | ||
1293 | data.trie = trie; | |
1294 | data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer()); | |
1295 | data.ces = ce64s.getBuffer(); | |
1296 | data.contexts = contexts.getBuffer(); | |
1297 | ||
1298 | data.ce32sLength = ce32s.size(); | |
1299 | data.cesLength = ce64s.size(); | |
1300 | data.contextsLength = contexts.length(); | |
1301 | ||
1302 | data.base = base; | |
1303 | if(jamoIndex >= 0) { | |
1304 | data.jamoCE32s = data.ce32s + jamoIndex; | |
1305 | } else { | |
1306 | data.jamoCE32s = base->jamoCE32s; | |
1307 | } | |
1308 | data.unsafeBackwardSet = &unsafeBackwardSet; | |
1309 | } | |
1310 | ||
1311 | void | |
1312 | CollationDataBuilder::clearContexts() { | |
1313 | contexts.remove(); | |
1314 | UnicodeSetIterator iter(contextChars); | |
1315 | while(iter.next()) { | |
1316 | U_ASSERT(!iter.isString()); | |
1317 | uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint()); | |
1318 | U_ASSERT(isBuilderContextCE32(ce32)); | |
1319 | getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32; | |
1320 | } | |
1321 | } | |
1322 | ||
1323 | void | |
1324 | CollationDataBuilder::buildContexts(UErrorCode &errorCode) { | |
1325 | if(U_FAILURE(errorCode)) { return; } | |
1326 | // Ignore abandoned lists and the cached builtCE32, | |
1327 | // and build all contexts from scratch. | |
1328 | contexts.remove(); | |
1329 | UnicodeSetIterator iter(contextChars); | |
1330 | while(U_SUCCESS(errorCode) && iter.next()) { | |
1331 | U_ASSERT(!iter.isString()); | |
1332 | UChar32 c = iter.getCodepoint(); | |
1333 | uint32_t ce32 = utrie2_get32(trie, c); | |
1334 | if(!isBuilderContextCE32(ce32)) { | |
1335 | // Impossible: No context data for c in contextChars. | |
1336 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
1337 | return; | |
1338 | } | |
1339 | ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32); | |
1340 | ce32 = buildContext(cond, errorCode); | |
1341 | utrie2_set32(trie, c, ce32, &errorCode); | |
1342 | } | |
1343 | } | |
1344 | ||
1345 | uint32_t | |
1346 | CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) { | |
1347 | if(U_FAILURE(errorCode)) { return 0; } | |
1348 | // The list head must have no context. | |
1349 | U_ASSERT(!head->hasContext()); | |
1350 | // The list head must be followed by one or more nodes that all do have context. | |
1351 | U_ASSERT(head->next >= 0); | |
1352 | UCharsTrieBuilder prefixBuilder(errorCode); | |
1353 | UCharsTrieBuilder contractionBuilder(errorCode); | |
1354 | for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) { | |
1355 | // After the list head, the prefix or suffix can be empty, but not both. | |
1356 | U_ASSERT(cond == head || cond->hasContext()); | |
1357 | int32_t prefixLength = cond->prefixLength(); | |
1358 | UnicodeString prefix(cond->context, 0, prefixLength + 1); | |
1359 | // Collect all contraction suffixes for one prefix. | |
1360 | ConditionalCE32 *firstCond = cond; | |
1361 | ConditionalCE32 *lastCond = cond; | |
1362 | while(cond->next >= 0 && | |
1363 | (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) { | |
1364 | lastCond = cond; | |
1365 | } | |
1366 | uint32_t ce32; | |
1367 | int32_t suffixStart = prefixLength + 1; // == prefix.length() | |
1368 | if(lastCond->context.length() == suffixStart) { | |
1369 | // One prefix without contraction suffix. | |
1370 | U_ASSERT(firstCond == lastCond); | |
1371 | ce32 = lastCond->ce32; | |
1372 | cond = lastCond; | |
1373 | } else { | |
1374 | // Build the contractions trie. | |
1375 | contractionBuilder.clear(); | |
1376 | // Entry for an empty suffix, to be stored before the trie. | |
1377 | uint32_t emptySuffixCE32; | |
1378 | uint32_t flags = 0; | |
1379 | if(firstCond->context.length() == suffixStart) { | |
1380 | // There is a mapping for the prefix and the single character c. (p|c) | |
1381 | // If no other suffix matches, then we return this value. | |
1382 | emptySuffixCE32 = firstCond->ce32; | |
1383 | cond = getConditionalCE32(firstCond->next); | |
1384 | } else { | |
1385 | // There is no mapping for the prefix and just the single character. | |
1386 | // (There is no p|c, only p|cd, p|ce etc.) | |
1387 | flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH; | |
1388 | // When the prefix matches but none of the prefix-specific suffixes, | |
1389 | // then we fall back to the mappings with the next-longest prefix, | |
1390 | // and ultimately to mappings with no prefix. | |
1391 | // Each fallback might be another set of contractions. | |
1392 | // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, | |
1393 | // then in text "pch" we find the ch contraction. | |
1394 | for(cond = head;; cond = getConditionalCE32(cond->next)) { | |
1395 | int32_t length = cond->prefixLength(); | |
1396 | if(length == prefixLength) { break; } | |
1397 | if(cond->defaultCE32 != Collation::NO_CE32 && | |
1398 | (length==0 || prefix.endsWith(cond->context, 1, length))) { | |
1399 | emptySuffixCE32 = cond->defaultCE32; | |
1400 | } | |
1401 | } | |
1402 | cond = firstCond; | |
1403 | } | |
1404 | // Optimization: Set a flag when | |
1405 | // the first character of every contraction suffix has lccc!=0. | |
1406 | // Short-circuits contraction matching when a normal letter follows. | |
1407 | flags |= Collation::CONTRACT_NEXT_CCC; | |
1408 | // Add all of the non-empty suffixes into the contraction trie. | |
1409 | for(;;) { | |
1410 | UnicodeString suffix(cond->context, suffixStart); | |
1411 | uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0)); | |
1412 | if(fcd16 <= 0xff) { | |
1413 | flags &= ~Collation::CONTRACT_NEXT_CCC; | |
1414 | } | |
1415 | fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1)); | |
1416 | if(fcd16 > 0xff) { | |
1417 | // The last suffix character has lccc!=0, allowing for discontiguous contractions. | |
1418 | flags |= Collation::CONTRACT_TRAILING_CCC; | |
1419 | } | |
1420 | contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode); | |
1421 | if(cond == lastCond) { break; } | |
1422 | cond = getConditionalCE32(cond->next); | |
1423 | } | |
1424 | int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode); | |
1425 | if(U_FAILURE(errorCode)) { return 0; } | |
1426 | if(index > Collation::MAX_INDEX) { | |
1427 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1428 | return 0; | |
1429 | } | |
1430 | ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags; | |
1431 | } | |
1432 | U_ASSERT(cond == lastCond); | |
1433 | firstCond->defaultCE32 = ce32; | |
1434 | if(prefixLength == 0) { | |
1435 | if(cond->next < 0) { | |
1436 | // No non-empty prefixes, only contractions. | |
1437 | return ce32; | |
1438 | } | |
1439 | } else { | |
1440 | prefix.remove(0, 1); // Remove the length unit. | |
1441 | prefix.reverse(); | |
1442 | prefixBuilder.add(prefix, (int32_t)ce32, errorCode); | |
1443 | if(cond->next < 0) { break; } | |
1444 | } | |
1445 | } | |
1446 | U_ASSERT(head->defaultCE32 != Collation::NO_CE32); | |
1447 | int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode); | |
1448 | if(U_FAILURE(errorCode)) { return 0; } | |
1449 | if(index > Collation::MAX_INDEX) { | |
1450 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1451 | return 0; | |
1452 | } | |
1453 | return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index); | |
1454 | } | |
1455 | ||
1456 | int32_t | |
1457 | CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, | |
1458 | UErrorCode &errorCode) { | |
1459 | UnicodeString context; | |
1460 | context.append((UChar)(defaultCE32 >> 16)).append((UChar)defaultCE32); | |
1461 | UnicodeString trieString; | |
1462 | context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode)); | |
1463 | if(U_FAILURE(errorCode)) { return -1; } | |
1464 | int32_t index = contexts.indexOf(context); | |
1465 | if(index < 0) { | |
1466 | index = contexts.length(); | |
1467 | contexts.append(context); | |
1468 | } | |
1469 | return index; | |
1470 | } | |
1471 | ||
1472 | void | |
1473 | CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) { | |
1474 | if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; } | |
1475 | ||
1476 | delete fastLatinBuilder; | |
1477 | fastLatinBuilder = new CollationFastLatinBuilder(errorCode); | |
1478 | if(fastLatinBuilder == NULL) { | |
1479 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
1480 | return; | |
1481 | } | |
1482 | if(fastLatinBuilder->forData(data, errorCode)) { | |
1483 | const uint16_t *table = fastLatinBuilder->getTable(); | |
1484 | int32_t length = fastLatinBuilder->lengthOfTable(); | |
1485 | if(base != NULL && length == base->fastLatinTableLength && | |
1486 | uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) { | |
1487 | // Same fast Latin table as in the base, use that one instead. | |
1488 | delete fastLatinBuilder; | |
1489 | fastLatinBuilder = NULL; | |
1490 | table = base->fastLatinTable; | |
1491 | } | |
1492 | data.fastLatinTable = table; | |
1493 | data.fastLatinTableLength = length; | |
1494 | } else { | |
1495 | delete fastLatinBuilder; | |
1496 | fastLatinBuilder = NULL; | |
1497 | } | |
1498 | } | |
1499 | ||
1500 | int32_t | |
1501 | CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) { | |
1502 | return getCEs(s, 0, ces, cesLength); | |
1503 | } | |
1504 | ||
1505 | int32_t | |
1506 | CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s, | |
1507 | int64_t ces[], int32_t cesLength) { | |
1508 | int32_t prefixLength = prefix.length(); | |
1509 | if(prefixLength == 0) { | |
1510 | return getCEs(s, 0, ces, cesLength); | |
1511 | } else { | |
1512 | return getCEs(prefix + s, prefixLength, ces, cesLength); | |
1513 | } | |
1514 | } | |
1515 | ||
1516 | int32_t | |
1517 | CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start, | |
1518 | int64_t ces[], int32_t cesLength) { | |
1519 | if(collIter == NULL) { | |
1520 | collIter = new DataBuilderCollationIterator(*this); | |
1521 | if(collIter == NULL) { return 0; } | |
1522 | } | |
1523 | return collIter->fetchCEs(s, start, ces, cesLength); | |
1524 | } | |
1525 | ||
1526 | U_NAMESPACE_END | |
1527 | ||
1528 | #endif // !UCONFIG_NO_COLLATION |