]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
b331163b | 3 | * Copyright (C) 2012-2015, International Business Machines |
57a6839d A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * collationdatabuilder.cpp | |
7 | * | |
8 | * (replaced the former ucol_elm.cpp) | |
9 | * | |
10 | * created on: 2012apr01 | |
11 | * created by: Markus W. Scherer | |
12 | */ | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | ||
16 | #if !UCONFIG_NO_COLLATION | |
17 | ||
18 | #include "unicode/localpointer.h" | |
19 | #include "unicode/uchar.h" | |
20 | #include "unicode/ucharstrie.h" | |
21 | #include "unicode/ucharstriebuilder.h" | |
22 | #include "unicode/uniset.h" | |
23 | #include "unicode/unistr.h" | |
24 | #include "unicode/usetiter.h" | |
25 | #include "unicode/utf16.h" | |
26 | #include "cmemory.h" | |
27 | #include "collation.h" | |
28 | #include "collationdata.h" | |
29 | #include "collationdatabuilder.h" | |
30 | #include "collationfastlatinbuilder.h" | |
31 | #include "collationiterator.h" | |
32 | #include "normalizer2impl.h" | |
33 | #include "utrie2.h" | |
34 | #include "uvectr32.h" | |
35 | #include "uvectr64.h" | |
36 | #include "uvector.h" | |
37 | ||
57a6839d A |
38 | U_NAMESPACE_BEGIN |
39 | ||
40 | CollationDataBuilder::CEModifier::~CEModifier() {} | |
41 | ||
42 | /** | |
43 | * Build-time context and CE32 for a code point. | |
44 | * If a code point has contextual mappings, then the default (no-context) mapping | |
45 | * and all conditional mappings are stored in a singly-linked list | |
46 | * of ConditionalCE32, sorted by context strings. | |
47 | * | |
48 | * Context strings sort by prefix length, then by prefix, then by contraction suffix. | |
49 | * Context strings must be unique and in ascending order. | |
50 | */ | |
51 | struct ConditionalCE32 : public UMemory { | |
b331163b A |
52 | ConditionalCE32() |
53 | : context(), | |
54 | ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), | |
55 | next(-1) {} | |
57a6839d A |
56 | ConditionalCE32(const UnicodeString &ct, uint32_t ce) |
57 | : context(ct), | |
58 | ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), | |
59 | next(-1) {} | |
60 | ||
61 | inline UBool hasContext() const { return context.length() > 1; } | |
62 | inline int32_t prefixLength() const { return context.charAt(0); } | |
63 | ||
64 | /** | |
65 | * "\0" for the first entry for any code point, with its default CE32. | |
66 | * | |
67 | * Otherwise one unit with the length of the prefix string, | |
68 | * then the prefix string, then the contraction suffix. | |
69 | */ | |
70 | UnicodeString context; | |
71 | /** | |
72 | * CE32 for the code point and its context. | |
73 | * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). | |
74 | */ | |
75 | uint32_t ce32; | |
76 | /** | |
77 | * Default CE32 for all contexts with this same prefix. | |
78 | * Initially NO_CE32. Set only while building runtime data structures, | |
79 | * and only on one of the nodes of a sub-list with the same prefix. | |
80 | */ | |
81 | uint32_t defaultCE32; | |
82 | /** | |
83 | * CE32 for the built contexts. | |
84 | * When fetching CEs from the builder, the contexts are built into their runtime form | |
85 | * so that the normal collation implementation can process them. | |
86 | * The result is cached in the list head. It is reset when the contexts are modified. | |
87 | */ | |
88 | uint32_t builtCE32; | |
89 | /** | |
90 | * Index of the next ConditionalCE32. | |
91 | * Negative for the end of the list. | |
92 | */ | |
93 | int32_t next; | |
94 | }; | |
95 | ||
96 | U_CDECL_BEGIN | |
97 | ||
98 | U_CAPI void U_CALLCONV | |
99 | uprv_deleteConditionalCE32(void *obj) { | |
100 | delete static_cast<ConditionalCE32 *>(obj); | |
101 | } | |
102 | ||
103 | U_CDECL_END | |
104 | ||
105 | /** | |
106 | * Build-time collation element and character iterator. | |
107 | * Uses the runtime CollationIterator for fetching CEs for a string | |
108 | * but reads from the builder's unfinished data structures. | |
109 | * In particular, this class reads from the unfinished trie | |
110 | * and has to avoid CollationIterator::nextCE() and redirect other | |
111 | * calls to data->getCE32() and data->getCE32FromSupplementary(). | |
112 | * | |
113 | * We do this so that we need not implement the collation algorithm | |
114 | * again for the builder and make it behave exactly like the runtime code. | |
115 | * That would be more difficult to test and maintain than this indirection. | |
116 | * | |
117 | * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, | |
118 | * so the data accesses from those code paths need not be modified. | |
119 | * | |
120 | * This class iterates directly over whole code points | |
121 | * so that the CollationIterator does not need the finished trie | |
122 | * for handling the LEAD_SURROGATE_TAG. | |
123 | */ | |
124 | class DataBuilderCollationIterator : public CollationIterator { | |
125 | public: | |
126 | DataBuilderCollationIterator(CollationDataBuilder &b); | |
127 | ||
128 | virtual ~DataBuilderCollationIterator(); | |
129 | ||
130 | int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength); | |
131 | ||
132 | virtual void resetToOffset(int32_t newOffset); | |
133 | virtual int32_t getOffset() const; | |
134 | ||
135 | virtual UChar32 nextCodePoint(UErrorCode &errorCode); | |
136 | virtual UChar32 previousCodePoint(UErrorCode &errorCode); | |
137 | ||
138 | protected: | |
139 | virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode); | |
140 | virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode); | |
141 | ||
142 | virtual uint32_t getDataCE32(UChar32 c) const; | |
143 | virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode); | |
144 | ||
145 | CollationDataBuilder &builder; | |
146 | CollationData builderData; | |
147 | uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; | |
148 | const UnicodeString *s; | |
149 | int32_t pos; | |
150 | }; | |
151 | ||
152 | DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b) | |
153 | : CollationIterator(&builderData, /*numeric=*/ FALSE), | |
154 | builder(b), builderData(b.nfcImpl), | |
155 | s(NULL), pos(0) { | |
156 | builderData.base = builder.base; | |
157 | // Set all of the jamoCE32s[] to indirection CE32s. | |
158 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. | |
159 | UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j); | |
160 | jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) | | |
161 | CollationDataBuilder::IS_BUILDER_JAMO_CE32; | |
162 | } | |
163 | builderData.jamoCE32s = jamoCE32s; | |
164 | } | |
165 | ||
166 | DataBuilderCollationIterator::~DataBuilderCollationIterator() {} | |
167 | ||
168 | int32_t | |
169 | DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start, | |
170 | int64_t ces[], int32_t cesLength) { | |
171 | // Set the pointers each time, in case they changed due to reallocation. | |
172 | builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer()); | |
173 | builderData.ces = builder.ce64s.getBuffer(); | |
174 | builderData.contexts = builder.contexts.getBuffer(); | |
175 | // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32(). | |
176 | reset(); | |
177 | s = &str; | |
178 | pos = start; | |
179 | UErrorCode errorCode = U_ZERO_ERROR; | |
180 | while(U_SUCCESS(errorCode) && pos < s->length()) { | |
181 | // No need to keep all CEs in the iterator buffer. | |
182 | clearCEs(); | |
183 | UChar32 c = s->char32At(pos); | |
184 | pos += U16_LENGTH(c); | |
185 | uint32_t ce32 = utrie2_get32(builder.trie, c); | |
186 | const CollationData *d; | |
187 | if(ce32 == Collation::FALLBACK_CE32) { | |
188 | d = builder.base; | |
189 | ce32 = builder.base->getCE32(c); | |
190 | } else { | |
191 | d = &builderData; | |
192 | } | |
193 | appendCEsFromCE32(d, c, ce32, /*forward=*/ TRUE, errorCode); | |
194 | U_ASSERT(U_SUCCESS(errorCode)); | |
195 | for(int32_t i = 0; i < getCEsLength(); ++i) { | |
196 | int64_t ce = getCE(i); | |
197 | if(ce != 0) { | |
198 | if(cesLength < Collation::MAX_EXPANSION_LENGTH) { | |
199 | ces[cesLength] = ce; | |
200 | } | |
201 | ++cesLength; | |
202 | } | |
203 | } | |
204 | } | |
205 | return cesLength; | |
206 | } | |
207 | ||
208 | void | |
209 | DataBuilderCollationIterator::resetToOffset(int32_t newOffset) { | |
210 | reset(); | |
211 | pos = newOffset; | |
212 | } | |
213 | ||
214 | int32_t | |
215 | DataBuilderCollationIterator::getOffset() const { | |
216 | return pos; | |
217 | } | |
218 | ||
219 | UChar32 | |
220 | DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { | |
221 | if(pos == s->length()) { | |
222 | return U_SENTINEL; | |
223 | } | |
224 | UChar32 c = s->char32At(pos); | |
225 | pos += U16_LENGTH(c); | |
226 | return c; | |
227 | } | |
228 | ||
229 | UChar32 | |
230 | DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { | |
231 | if(pos == 0) { | |
232 | return U_SENTINEL; | |
233 | } | |
234 | UChar32 c = s->char32At(pos - 1); | |
235 | pos -= U16_LENGTH(c); | |
236 | return c; | |
237 | } | |
238 | ||
239 | void | |
240 | DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { | |
241 | pos = s->moveIndex32(pos, num); | |
242 | } | |
243 | ||
244 | void | |
245 | DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { | |
246 | pos = s->moveIndex32(pos, -num); | |
247 | } | |
248 | ||
249 | uint32_t | |
250 | DataBuilderCollationIterator::getDataCE32(UChar32 c) const { | |
251 | return utrie2_get32(builder.trie, c); | |
252 | } | |
253 | ||
254 | uint32_t | |
255 | DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) { | |
256 | U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)); | |
257 | if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) { | |
258 | UChar32 jamo = Collation::indexFromCE32(ce32); | |
259 | return utrie2_get32(builder.trie, jamo); | |
260 | } else { | |
261 | ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32); | |
262 | if(cond->builtCE32 == Collation::NO_CE32) { | |
263 | // Build the context-sensitive mappings into their runtime form and cache the result. | |
264 | cond->builtCE32 = builder.buildContext(cond, errorCode); | |
265 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { | |
266 | errorCode = U_ZERO_ERROR; | |
267 | builder.clearContexts(); | |
268 | cond->builtCE32 = builder.buildContext(cond, errorCode); | |
269 | } | |
270 | builderData.contexts = builder.contexts.getBuffer(); | |
271 | } | |
272 | return cond->builtCE32; | |
273 | } | |
274 | } | |
275 | ||
276 | // ------------------------------------------------------------------------- *** | |
277 | ||
278 | CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode) | |
279 | : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), | |
280 | base(NULL), baseSettings(NULL), | |
281 | trie(NULL), | |
282 | ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode), | |
283 | modified(FALSE), | |
284 | fastLatinEnabled(FALSE), fastLatinBuilder(NULL), | |
285 | collIter(NULL) { | |
286 | // Reserve the first CE32 for U+0000. | |
287 | ce32s.addElement(0, errorCode); | |
288 | conditionalCE32s.setDeleter(uprv_deleteConditionalCE32); | |
289 | } | |
290 | ||
291 | CollationDataBuilder::~CollationDataBuilder() { | |
292 | utrie2_close(trie); | |
293 | delete fastLatinBuilder; | |
294 | delete collIter; | |
295 | } | |
296 | ||
297 | void | |
298 | CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) { | |
299 | if(U_FAILURE(errorCode)) { return; } | |
300 | if(trie != NULL) { | |
301 | errorCode = U_INVALID_STATE_ERROR; | |
302 | return; | |
303 | } | |
304 | if(b == NULL) { | |
305 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
306 | return; | |
307 | } | |
308 | base = b; | |
309 | ||
310 | // For a tailoring, the default is to fall back to the base. | |
311 | trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode); | |
312 | ||
313 | // Set the Latin-1 letters block so that it is allocated first in the data array, | |
314 | // to try to improve locality of reference when sorting Latin-1 text. | |
315 | // Do not use utrie2_setRange32() since that will not actually allocate blocks | |
316 | // that are filled with the default value. | |
317 | // ASCII (0..7F) is already preallocated anyway. | |
318 | for(UChar32 c = 0xc0; c <= 0xff; ++c) { | |
319 | utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode); | |
320 | } | |
321 | ||
322 | // Hangul syllables are not tailorable (except via tailoring Jamos). | |
323 | // Always set the Hangul tag to help performance. | |
324 | // Do this here, rather than in buildMappings(), | |
325 | // so that we see the HANGUL_TAG in various assertions. | |
326 | uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); | |
327 | utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode); | |
328 | ||
329 | // Copy the set contents but don't copy/clone the set as a whole because | |
330 | // that would copy the isFrozen state too. | |
331 | unsafeBackwardSet.addAll(*b->unsafeBackwardSet); | |
332 | ||
333 | if(U_FAILURE(errorCode)) { return; } | |
334 | } | |
335 | ||
336 | UBool | |
337 | CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end, | |
338 | uint32_t primary, int32_t step, | |
339 | UErrorCode &errorCode) { | |
340 | if(U_FAILURE(errorCode)) { return FALSE; } | |
341 | U_ASSERT(start <= end); | |
342 | // TODO: Do we need to check what values are currently set for start..end? | |
343 | // An offset range is worth it only if we can achieve an overlap between | |
344 | // adjacent UTrie2 blocks of 32 code points each. | |
345 | // An offset CE is also a little more expensive to look up and compute | |
346 | // than a simple CE. | |
347 | // If the range spans at least three UTrie2 block boundaries (> 64 code points), | |
348 | // then we take it. | |
349 | // If the range spans one or two block boundaries and there are | |
350 | // at least 4 code points on either side, then we take it. | |
351 | // (We could additionally require a minimum range length of, say, 16.) | |
352 | int32_t blockDelta = (end >> 5) - (start >> 5); | |
353 | if(2 <= step && step <= 0x7f && | |
354 | (blockDelta >= 3 || | |
355 | (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) { | |
356 | int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step; | |
357 | if(isCompressiblePrimary(primary)) { dataCE |= 0x80; } | |
358 | int32_t index = addCE(dataCE, errorCode); | |
359 | if(U_FAILURE(errorCode)) { return 0; } | |
360 | if(index > Collation::MAX_INDEX) { | |
361 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
362 | return 0; | |
363 | } | |
364 | uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index); | |
365 | utrie2_setRange32(trie, start, end, offsetCE32, TRUE, &errorCode); | |
366 | modified = TRUE; | |
367 | return TRUE; | |
368 | } else { | |
369 | return FALSE; | |
370 | } | |
371 | } | |
372 | ||
373 | uint32_t | |
374 | CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, | |
375 | uint32_t primary, int32_t step, | |
376 | UErrorCode &errorCode) { | |
377 | if(U_FAILURE(errorCode)) { return 0; } | |
378 | UBool isCompressible = isCompressiblePrimary(primary); | |
379 | if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) { | |
380 | return Collation::incThreeBytePrimaryByOffset(primary, isCompressible, | |
381 | (end - start + 1) * step); | |
382 | } else { | |
383 | // Short range: Set individual CE32s. | |
384 | for(;;) { | |
385 | utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode); | |
386 | ++start; | |
387 | primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step); | |
388 | if(start > end) { return primary; } | |
389 | } | |
390 | modified = TRUE; | |
391 | } | |
392 | } | |
393 | ||
394 | uint32_t | |
395 | CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const { | |
396 | int32_t i = Collation::indexFromCE32(ce32); | |
397 | int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i); | |
398 | uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); | |
399 | return Collation::makeLongPrimaryCE32(p); | |
400 | } | |
401 | ||
402 | UBool | |
403 | CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const { | |
404 | return base->isCompressibleLeadByte(b); | |
405 | } | |
406 | ||
407 | UBool | |
408 | CollationDataBuilder::isAssigned(UChar32 c) const { | |
409 | return Collation::isAssignedCE32(utrie2_get32(trie, c)); | |
410 | } | |
411 | ||
412 | uint32_t | |
413 | CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const { | |
414 | uint32_t ce32 = utrie2_get32(trie, c); | |
415 | if(Collation::isLongPrimaryCE32(ce32)) { | |
416 | return Collation::primaryFromLongPrimaryCE32(ce32); | |
417 | } else { | |
418 | return 0; | |
419 | } | |
420 | } | |
421 | ||
422 | int64_t | |
423 | CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const { | |
424 | if(U_FAILURE(errorCode)) { return 0; } | |
b331163b | 425 | // Keep parallel with CollationData::getSingleCE(). |
57a6839d A |
426 | UBool fromBase = FALSE; |
427 | uint32_t ce32 = utrie2_get32(trie, c); | |
428 | if(ce32 == Collation::FALLBACK_CE32) { | |
429 | fromBase = TRUE; | |
430 | ce32 = base->getCE32(c); | |
431 | } | |
432 | while(Collation::isSpecialCE32(ce32)) { | |
433 | switch(Collation::tagFromCE32(ce32)) { | |
434 | case Collation::LATIN_EXPANSION_TAG: | |
435 | case Collation::BUILDER_DATA_TAG: | |
436 | case Collation::PREFIX_TAG: | |
437 | case Collation::CONTRACTION_TAG: | |
438 | case Collation::HANGUL_TAG: | |
439 | case Collation::LEAD_SURROGATE_TAG: | |
440 | errorCode = U_UNSUPPORTED_ERROR; | |
441 | return 0; | |
442 | case Collation::FALLBACK_TAG: | |
443 | case Collation::RESERVED_TAG_3: | |
444 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
445 | return 0; | |
446 | case Collation::LONG_PRIMARY_TAG: | |
447 | return Collation::ceFromLongPrimaryCE32(ce32); | |
448 | case Collation::LONG_SECONDARY_TAG: | |
449 | return Collation::ceFromLongSecondaryCE32(ce32); | |
450 | case Collation::EXPANSION32_TAG: | |
451 | if(Collation::lengthFromCE32(ce32) == 1) { | |
452 | int32_t i = Collation::indexFromCE32(ce32); | |
453 | ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i); | |
454 | break; | |
455 | } else { | |
456 | errorCode = U_UNSUPPORTED_ERROR; | |
457 | return 0; | |
458 | } | |
459 | case Collation::EXPANSION_TAG: { | |
460 | if(Collation::lengthFromCE32(ce32) == 1) { | |
461 | int32_t i = Collation::indexFromCE32(ce32); | |
462 | return fromBase ? base->ces[i] : ce64s.elementAti(i); | |
463 | } else { | |
464 | errorCode = U_UNSUPPORTED_ERROR; | |
465 | return 0; | |
466 | } | |
467 | } | |
468 | case Collation::DIGIT_TAG: | |
469 | // Fetch the non-numeric-collation CE32 and continue. | |
470 | ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32)); | |
471 | break; | |
472 | case Collation::U0000_TAG: | |
473 | U_ASSERT(c == 0); | |
474 | // Fetch the normal ce32 for U+0000 and continue. | |
475 | ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0); | |
476 | break; | |
477 | case Collation::OFFSET_TAG: | |
478 | ce32 = getCE32FromOffsetCE32(fromBase, c, ce32); | |
479 | break; | |
480 | case Collation::IMPLICIT_TAG: | |
481 | return Collation::unassignedCEFromCodePoint(c); | |
482 | } | |
483 | } | |
484 | return Collation::ceFromSimpleCE32(ce32); | |
485 | } | |
486 | ||
487 | int32_t | |
488 | CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) { | |
489 | int32_t length = ce64s.size(); | |
490 | for(int32_t i = 0; i < length; ++i) { | |
491 | if(ce == ce64s.elementAti(i)) { return i; } | |
492 | } | |
493 | ce64s.addElement(ce, errorCode); | |
494 | return length; | |
495 | } | |
496 | ||
497 | int32_t | |
498 | CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) { | |
499 | int32_t length = ce32s.size(); | |
500 | for(int32_t i = 0; i < length; ++i) { | |
501 | if(ce32 == (uint32_t)ce32s.elementAti(i)) { return i; } | |
502 | } | |
503 | ce32s.addElement((int32_t)ce32, errorCode); | |
504 | return length; | |
505 | } | |
506 | ||
507 | int32_t | |
508 | CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32, | |
509 | UErrorCode &errorCode) { | |
510 | if(U_FAILURE(errorCode)) { return -1; } | |
511 | U_ASSERT(!context.isEmpty()); | |
512 | int32_t index = conditionalCE32s.size(); | |
513 | if(index > Collation::MAX_INDEX) { | |
514 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
515 | return -1; | |
516 | } | |
517 | ConditionalCE32 *cond = new ConditionalCE32(context, ce32); | |
518 | if(cond == NULL) { | |
519 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
520 | return -1; | |
521 | } | |
522 | conditionalCE32s.addElement(cond, errorCode); | |
523 | return index; | |
524 | } | |
525 | ||
526 | void | |
527 | CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s, | |
528 | const int64_t ces[], int32_t cesLength, | |
529 | UErrorCode &errorCode) { | |
530 | uint32_t ce32 = encodeCEs(ces, cesLength, errorCode); | |
531 | addCE32(prefix, s, ce32, errorCode); | |
532 | } | |
533 | ||
534 | void | |
535 | CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s, | |
536 | uint32_t ce32, UErrorCode &errorCode) { | |
537 | if(U_FAILURE(errorCode)) { return; } | |
538 | if(s.isEmpty()) { | |
539 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
540 | return; | |
541 | } | |
542 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
543 | errorCode = U_INVALID_STATE_ERROR; | |
544 | return; | |
545 | } | |
546 | UChar32 c = s.char32At(0); | |
547 | int32_t cLength = U16_LENGTH(c); | |
548 | uint32_t oldCE32 = utrie2_get32(trie, c); | |
549 | UBool hasContext = !prefix.isEmpty() || s.length() > cLength; | |
550 | if(oldCE32 == Collation::FALLBACK_CE32) { | |
551 | // First tailoring for c. | |
552 | // If c has contextual base mappings or if we add a contextual mapping, | |
553 | // then copy the base mappings. | |
554 | // Otherwise we just override the base mapping. | |
555 | uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c)); | |
556 | if(hasContext || Collation::ce32HasContext(baseCE32)) { | |
557 | oldCE32 = copyFromBaseCE32(c, baseCE32, TRUE, errorCode); | |
558 | utrie2_set32(trie, c, oldCE32, &errorCode); | |
559 | if(U_FAILURE(errorCode)) { return; } | |
560 | } | |
561 | } | |
562 | if(!hasContext) { | |
563 | // No prefix, no contraction. | |
564 | if(!isBuilderContextCE32(oldCE32)) { | |
565 | utrie2_set32(trie, c, ce32, &errorCode); | |
566 | } else { | |
567 | ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32); | |
568 | cond->builtCE32 = Collation::NO_CE32; | |
569 | cond->ce32 = ce32; | |
570 | } | |
571 | } else { | |
572 | ConditionalCE32 *cond; | |
573 | if(!isBuilderContextCE32(oldCE32)) { | |
574 | // Replace the simple oldCE32 with a builder context CE32 | |
575 | // pointing to a new ConditionalCE32 list head. | |
576 | int32_t index = addConditionalCE32(UnicodeString((UChar)0), oldCE32, errorCode); | |
577 | if(U_FAILURE(errorCode)) { return; } | |
578 | uint32_t contextCE32 = makeBuilderContextCE32(index); | |
579 | utrie2_set32(trie, c, contextCE32, &errorCode); | |
580 | contextChars.add(c); | |
581 | cond = getConditionalCE32(index); | |
582 | } else { | |
583 | cond = getConditionalCE32ForCE32(oldCE32); | |
584 | cond->builtCE32 = Collation::NO_CE32; | |
585 | } | |
586 | UnicodeString suffix(s, cLength); | |
587 | UnicodeString context((UChar)prefix.length()); | |
588 | context.append(prefix).append(suffix); | |
589 | unsafeBackwardSet.addAll(suffix); | |
590 | for(;;) { | |
591 | // invariant: context > cond->context | |
592 | int32_t next = cond->next; | |
593 | if(next < 0) { | |
594 | // Append a new ConditionalCE32 after cond. | |
595 | int32_t index = addConditionalCE32(context, ce32, errorCode); | |
596 | if(U_FAILURE(errorCode)) { return; } | |
597 | cond->next = index; | |
598 | break; | |
599 | } | |
600 | ConditionalCE32 *nextCond = getConditionalCE32(next); | |
601 | int8_t cmp = context.compare(nextCond->context); | |
602 | if(cmp < 0) { | |
603 | // Insert a new ConditionalCE32 between cond and nextCond. | |
604 | int32_t index = addConditionalCE32(context, ce32, errorCode); | |
605 | if(U_FAILURE(errorCode)) { return; } | |
606 | cond->next = index; | |
607 | getConditionalCE32(index)->next = next; | |
608 | break; | |
609 | } else if(cmp == 0) { | |
610 | // Same context as before, overwrite its ce32. | |
611 | nextCond->ce32 = ce32; | |
612 | break; | |
613 | } | |
614 | cond = nextCond; | |
615 | } | |
616 | } | |
617 | modified = TRUE; | |
618 | } | |
619 | ||
620 | uint32_t | |
621 | CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) { | |
622 | uint32_t p = (uint32_t)(ce >> 32); | |
623 | uint32_t lower32 = (uint32_t)ce; | |
624 | uint32_t t = (uint32_t)(ce & 0xffff); | |
625 | U_ASSERT((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. | |
626 | if((ce & INT64_C(0xffff00ff00ff)) == 0) { | |
627 | // normal form ppppsstt | |
628 | return p | (lower32 >> 16) | (t >> 8); | |
629 | } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) { | |
630 | // long-primary form ppppppC1 | |
631 | return Collation::makeLongPrimaryCE32(p); | |
632 | } else if(p == 0 && (t & 0xff) == 0) { | |
633 | // long-secondary form ssssttC2 | |
634 | return Collation::makeLongSecondaryCE32(lower32); | |
635 | } | |
636 | return Collation::NO_CE32; | |
637 | } | |
638 | ||
639 | uint32_t | |
640 | CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) { | |
641 | // Try to encode one CE as one CE32. | |
642 | uint32_t ce32 = encodeOneCEAsCE32(ce); | |
643 | if(ce32 != Collation::NO_CE32) { return ce32; } | |
644 | int32_t index = addCE(ce, errorCode); | |
645 | if(U_FAILURE(errorCode)) { return 0; } | |
646 | if(index > Collation::MAX_INDEX) { | |
647 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
648 | return 0; | |
649 | } | |
650 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1); | |
651 | } | |
652 | ||
653 | uint32_t | |
654 | CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, | |
655 | UErrorCode &errorCode) { | |
656 | if(U_FAILURE(errorCode)) { return 0; } | |
657 | if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) { | |
658 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
659 | return 0; | |
660 | } | |
661 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
662 | errorCode = U_INVALID_STATE_ERROR; | |
663 | return 0; | |
664 | } | |
665 | if(cesLength == 0) { | |
666 | // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. | |
667 | // Do this here so that callers need not do it. | |
668 | return encodeOneCEAsCE32(0); | |
669 | } else if(cesLength == 1) { | |
670 | return encodeOneCE(ces[0], errorCode); | |
671 | } else if(cesLength == 2) { | |
672 | // Try to encode two CEs as one CE32. | |
673 | int64_t ce0 = ces[0]; | |
674 | int64_t ce1 = ces[1]; | |
675 | uint32_t p0 = (uint32_t)(ce0 >> 32); | |
676 | if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE && | |
677 | (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE && | |
678 | p0 != 0) { | |
679 | // Latin mini expansion | |
680 | return | |
681 | p0 | | |
682 | (((uint32_t)ce0 & 0xff00u) << 8) | | |
683 | (uint32_t)(ce1 >> 16) | | |
684 | Collation::SPECIAL_CE32_LOW_BYTE | | |
685 | Collation::LATIN_EXPANSION_TAG; | |
686 | } | |
687 | } | |
688 | // Try to encode two or more CEs as CE32s. | |
689 | int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH]; | |
690 | for(int32_t i = 0;; ++i) { | |
691 | if(i == cesLength) { | |
692 | return encodeExpansion32(newCE32s, cesLength, errorCode); | |
693 | } | |
694 | uint32_t ce32 = encodeOneCEAsCE32(ces[i]); | |
695 | if(ce32 == Collation::NO_CE32) { break; } | |
696 | newCE32s[i] = (int32_t)ce32; | |
697 | } | |
698 | return encodeExpansion(ces, cesLength, errorCode); | |
699 | } | |
700 | ||
701 | uint32_t | |
702 | CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) { | |
703 | if(U_FAILURE(errorCode)) { return 0; } | |
704 | // See if this sequence of CEs has already been stored. | |
705 | int64_t first = ces[0]; | |
706 | int32_t ce64sMax = ce64s.size() - length; | |
707 | for(int32_t i = 0; i <= ce64sMax; ++i) { | |
708 | if(first == ce64s.elementAti(i)) { | |
709 | if(i > Collation::MAX_INDEX) { | |
710 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
711 | return 0; | |
712 | } | |
713 | for(int32_t j = 1;; ++j) { | |
714 | if(j == length) { | |
715 | return Collation::makeCE32FromTagIndexAndLength( | |
716 | Collation::EXPANSION_TAG, i, length); | |
717 | } | |
718 | if(ce64s.elementAti(i + j) != ces[j]) { break; } | |
719 | } | |
720 | } | |
721 | } | |
722 | // Store the new sequence. | |
723 | int32_t i = ce64s.size(); | |
724 | if(i > Collation::MAX_INDEX) { | |
725 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
726 | return 0; | |
727 | } | |
728 | for(int32_t j = 0; j < length; ++j) { | |
729 | ce64s.addElement(ces[j], errorCode); | |
730 | } | |
731 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length); | |
732 | } | |
733 | ||
734 | uint32_t | |
735 | CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length, | |
736 | UErrorCode &errorCode) { | |
737 | if(U_FAILURE(errorCode)) { return 0; } | |
738 | // See if this sequence of CE32s has already been stored. | |
739 | int32_t first = newCE32s[0]; | |
740 | int32_t ce32sMax = ce32s.size() - length; | |
741 | for(int32_t i = 0; i <= ce32sMax; ++i) { | |
742 | if(first == ce32s.elementAti(i)) { | |
743 | if(i > Collation::MAX_INDEX) { | |
744 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
745 | return 0; | |
746 | } | |
747 | for(int32_t j = 1;; ++j) { | |
748 | if(j == length) { | |
749 | return Collation::makeCE32FromTagIndexAndLength( | |
750 | Collation::EXPANSION32_TAG, i, length); | |
751 | } | |
752 | if(ce32s.elementAti(i + j) != newCE32s[j]) { break; } | |
753 | } | |
754 | } | |
755 | } | |
756 | // Store the new sequence. | |
757 | int32_t i = ce32s.size(); | |
758 | if(i > Collation::MAX_INDEX) { | |
759 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
760 | return 0; | |
761 | } | |
762 | for(int32_t j = 0; j < length; ++j) { | |
763 | ce32s.addElement(newCE32s[j], errorCode); | |
764 | } | |
765 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length); | |
766 | } | |
767 | ||
768 | uint32_t | |
769 | CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, | |
770 | UErrorCode &errorCode) { | |
771 | if(U_FAILURE(errorCode)) { return 0; } | |
772 | if(!Collation::isSpecialCE32(ce32)) { return ce32; } | |
773 | switch(Collation::tagFromCE32(ce32)) { | |
774 | case Collation::LONG_PRIMARY_TAG: | |
775 | case Collation::LONG_SECONDARY_TAG: | |
776 | case Collation::LATIN_EXPANSION_TAG: | |
777 | // copy as is | |
778 | break; | |
779 | case Collation::EXPANSION32_TAG: { | |
780 | const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32); | |
781 | int32_t length = Collation::lengthFromCE32(ce32); | |
782 | ce32 = encodeExpansion32( | |
783 | reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode); | |
784 | break; | |
785 | } | |
786 | case Collation::EXPANSION_TAG: { | |
787 | const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32); | |
788 | int32_t length = Collation::lengthFromCE32(ce32); | |
789 | ce32 = encodeExpansion(baseCEs, length, errorCode); | |
790 | break; | |
791 | } | |
792 | case Collation::PREFIX_TAG: { | |
793 | // Flatten prefixes and nested suffixes (contractions) | |
794 | // into a linear list of ConditionalCE32. | |
795 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
796 | ce32 = CollationData::readCE32(p); // Default if no prefix match. | |
797 | if(!withContext) { | |
798 | return copyFromBaseCE32(c, ce32, FALSE, errorCode); | |
799 | } | |
b331163b | 800 | ConditionalCE32 head; |
57a6839d A |
801 | UnicodeString context((UChar)0); |
802 | int32_t index; | |
803 | if(Collation::isContractionCE32(ce32)) { | |
804 | index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); | |
805 | } else { | |
806 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
807 | head.next = index = addConditionalCE32(context, ce32, errorCode); | |
808 | } | |
809 | if(U_FAILURE(errorCode)) { return 0; } | |
810 | ConditionalCE32 *cond = getConditionalCE32(index); // the last ConditionalCE32 so far | |
811 | UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); | |
812 | while(prefixes.next(errorCode)) { | |
813 | context = prefixes.getString(); | |
814 | context.reverse(); | |
815 | context.insert(0, (UChar)context.length()); | |
816 | ce32 = (uint32_t)prefixes.getValue(); | |
817 | if(Collation::isContractionCE32(ce32)) { | |
818 | index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode); | |
819 | } else { | |
820 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
821 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
822 | } | |
823 | if(U_FAILURE(errorCode)) { return 0; } | |
824 | cond = getConditionalCE32(index); | |
825 | } | |
826 | ce32 = makeBuilderContextCE32(head.next); | |
827 | contextChars.add(c); | |
828 | break; | |
829 | } | |
830 | case Collation::CONTRACTION_TAG: { | |
831 | if(!withContext) { | |
832 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
833 | ce32 = CollationData::readCE32(p); // Default if no suffix match. | |
834 | return copyFromBaseCE32(c, ce32, FALSE, errorCode); | |
835 | } | |
b331163b | 836 | ConditionalCE32 head; |
57a6839d A |
837 | UnicodeString context((UChar)0); |
838 | copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); | |
839 | ce32 = makeBuilderContextCE32(head.next); | |
840 | contextChars.add(c); | |
841 | break; | |
842 | } | |
843 | case Collation::HANGUL_TAG: | |
844 | errorCode = U_UNSUPPORTED_ERROR; // We forbid tailoring of Hangul syllables. | |
845 | break; | |
846 | case Collation::OFFSET_TAG: | |
847 | ce32 = getCE32FromOffsetCE32(TRUE, c, ce32); | |
848 | break; | |
849 | case Collation::IMPLICIT_TAG: | |
850 | ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode); | |
851 | break; | |
852 | default: | |
853 | U_ASSERT(FALSE); // require ce32 == base->getFinalCE32(ce32) | |
854 | break; | |
855 | } | |
856 | return ce32; | |
857 | } | |
858 | ||
859 | int32_t | |
860 | CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, | |
861 | ConditionalCE32 *cond, UErrorCode &errorCode) { | |
862 | if(U_FAILURE(errorCode)) { return 0; } | |
863 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
864 | int32_t index; | |
865 | if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { | |
866 | // No match on the single code point. | |
867 | // We are underneath a prefix, and the default mapping is just | |
868 | // a fallback to the mappings for a shorter prefix. | |
869 | U_ASSERT(context.length() > 1); | |
870 | index = -1; | |
871 | } else { | |
872 | ce32 = CollationData::readCE32(p); // Default if no suffix match. | |
873 | U_ASSERT(!Collation::isContractionCE32(ce32)); | |
874 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
875 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
876 | if(U_FAILURE(errorCode)) { return 0; } | |
877 | cond = getConditionalCE32(index); | |
878 | } | |
879 | ||
880 | int32_t suffixStart = context.length(); | |
881 | UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); | |
882 | while(suffixes.next(errorCode)) { | |
883 | context.append(suffixes.getString()); | |
884 | ce32 = copyFromBaseCE32(c, (uint32_t)suffixes.getValue(), TRUE, errorCode); | |
885 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
886 | if(U_FAILURE(errorCode)) { return 0; } | |
887 | // No need to update the unsafeBackwardSet because the tailoring set | |
888 | // is already a copy of the base set. | |
889 | cond = getConditionalCE32(index); | |
890 | context.truncate(suffixStart); | |
891 | } | |
892 | U_ASSERT(index >= 0); | |
893 | return index; | |
894 | } | |
895 | ||
896 | class CopyHelper { | |
897 | public: | |
898 | CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d, | |
899 | const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode) | |
900 | : src(s), dest(d), modifier(m), | |
901 | errorCode(initialErrorCode) {} | |
902 | ||
903 | UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) { | |
904 | ce32 = copyCE32(ce32); | |
905 | utrie2_setRange32(dest.trie, start, end, ce32, TRUE, &errorCode); | |
906 | if(CollationDataBuilder::isBuilderContextCE32(ce32)) { | |
907 | dest.contextChars.add(start, end); | |
908 | } | |
909 | return U_SUCCESS(errorCode); | |
910 | } | |
911 | ||
912 | uint32_t copyCE32(uint32_t ce32) { | |
913 | if(!Collation::isSpecialCE32(ce32)) { | |
914 | int64_t ce = modifier.modifyCE32(ce32); | |
915 | if(ce != Collation::NO_CE) { | |
916 | ce32 = dest.encodeOneCE(ce, errorCode); | |
917 | } | |
918 | } else { | |
919 | int32_t tag = Collation::tagFromCE32(ce32); | |
920 | if(tag == Collation::EXPANSION32_TAG) { | |
921 | const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer()); | |
922 | srcCE32s += Collation::indexFromCE32(ce32); | |
923 | int32_t length = Collation::lengthFromCE32(ce32); | |
924 | // Inspect the source CE32s. Just copy them if none are modified. | |
925 | // Otherwise copy to modifiedCEs, with modifications. | |
926 | UBool isModified = FALSE; | |
927 | for(int32_t i = 0; i < length; ++i) { | |
928 | ce32 = srcCE32s[i]; | |
929 | int64_t ce; | |
930 | if(Collation::isSpecialCE32(ce32) || | |
931 | (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) { | |
932 | if(isModified) { | |
933 | modifiedCEs[i] = Collation::ceFromCE32(ce32); | |
934 | } | |
935 | } else { | |
936 | if(!isModified) { | |
937 | for(int32_t j = 0; j < i; ++j) { | |
938 | modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]); | |
939 | } | |
940 | isModified = TRUE; | |
941 | } | |
942 | modifiedCEs[i] = ce; | |
943 | } | |
944 | } | |
945 | if(isModified) { | |
946 | ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); | |
947 | } else { | |
948 | ce32 = dest.encodeExpansion32( | |
949 | reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode); | |
950 | } | |
951 | } else if(tag == Collation::EXPANSION_TAG) { | |
952 | const int64_t *srcCEs = src.ce64s.getBuffer(); | |
953 | srcCEs += Collation::indexFromCE32(ce32); | |
954 | int32_t length = Collation::lengthFromCE32(ce32); | |
955 | // Inspect the source CEs. Just copy them if none are modified. | |
956 | // Otherwise copy to modifiedCEs, with modifications. | |
957 | UBool isModified = FALSE; | |
958 | for(int32_t i = 0; i < length; ++i) { | |
959 | int64_t srcCE = srcCEs[i]; | |
960 | int64_t ce = modifier.modifyCE(srcCE); | |
961 | if(ce == Collation::NO_CE) { | |
962 | if(isModified) { | |
963 | modifiedCEs[i] = srcCE; | |
964 | } | |
965 | } else { | |
966 | if(!isModified) { | |
967 | for(int32_t j = 0; j < i; ++j) { | |
968 | modifiedCEs[j] = srcCEs[j]; | |
969 | } | |
970 | isModified = TRUE; | |
971 | } | |
972 | modifiedCEs[i] = ce; | |
973 | } | |
974 | } | |
975 | if(isModified) { | |
976 | ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); | |
977 | } else { | |
978 | ce32 = dest.encodeExpansion(srcCEs, length, errorCode); | |
979 | } | |
980 | } else if(tag == Collation::BUILDER_DATA_TAG) { | |
981 | // Copy the list of ConditionalCE32. | |
982 | ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32); | |
983 | U_ASSERT(!cond->hasContext()); | |
984 | int32_t destIndex = dest.addConditionalCE32( | |
985 | cond->context, copyCE32(cond->ce32), errorCode); | |
986 | ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex); | |
987 | while(cond->next >= 0) { | |
988 | cond = src.getConditionalCE32(cond->next); | |
989 | ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex); | |
990 | destIndex = dest.addConditionalCE32( | |
991 | cond->context, copyCE32(cond->ce32), errorCode); | |
992 | int32_t suffixStart = cond->prefixLength() + 1; | |
993 | dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart)); | |
994 | prevDestCond->next = destIndex; | |
995 | } | |
996 | } else { | |
997 | // Just copy long CEs and Latin mini expansions (and other expected values) as is, | |
998 | // assuming that the modifier would not modify them. | |
999 | U_ASSERT(tag == Collation::LONG_PRIMARY_TAG || | |
1000 | tag == Collation::LONG_SECONDARY_TAG || | |
1001 | tag == Collation::LATIN_EXPANSION_TAG || | |
1002 | tag == Collation::HANGUL_TAG); | |
1003 | } | |
1004 | } | |
1005 | return ce32; | |
1006 | } | |
1007 | ||
1008 | const CollationDataBuilder &src; | |
1009 | CollationDataBuilder &dest; | |
1010 | const CollationDataBuilder::CEModifier &modifier; | |
1011 | int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH]; | |
1012 | UErrorCode errorCode; | |
1013 | }; | |
1014 | ||
1015 | U_CDECL_BEGIN | |
1016 | ||
1017 | static UBool U_CALLCONV | |
1018 | enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) { | |
1019 | return | |
1020 | value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 || | |
1021 | ((CopyHelper *)context)->copyRangeCE32(start, end, value); | |
1022 | } | |
1023 | ||
1024 | U_CDECL_END | |
1025 | ||
1026 | void | |
1027 | CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, | |
1028 | UErrorCode &errorCode) { | |
1029 | if(U_FAILURE(errorCode)) { return; } | |
1030 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
1031 | errorCode = U_INVALID_STATE_ERROR; | |
1032 | return; | |
1033 | } | |
1034 | CopyHelper helper(src, *this, modifier, errorCode); | |
1035 | utrie2_enum(src.trie, NULL, enumRangeForCopy, &helper); | |
1036 | errorCode = helper.errorCode; | |
1037 | // Update the contextChars and the unsafeBackwardSet while copying, | |
1038 | // in case a character had conditional mappings in the source builder | |
1039 | // and they were removed later. | |
1040 | modified |= src.modified; | |
1041 | } | |
1042 | ||
1043 | void | |
1044 | CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) { | |
1045 | if(U_FAILURE(errorCode) || set.isEmpty()) { return; } | |
1046 | UnicodeSetIterator iter(set); | |
1047 | while(iter.next() && !iter.isString()) { | |
1048 | UChar32 c = iter.getCodepoint(); | |
1049 | uint32_t ce32 = utrie2_get32(trie, c); | |
1050 | if(ce32 == Collation::FALLBACK_CE32) { | |
1051 | ce32 = base->getFinalCE32(base->getCE32(c)); | |
1052 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
1053 | utrie2_set32(trie, c, ce32, &errorCode); | |
1054 | } | |
1055 | } | |
1056 | modified = TRUE; | |
1057 | } | |
1058 | ||
1059 | void | |
1060 | CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) { | |
1061 | if(U_FAILURE(errorCode) || set.isEmpty()) { return; } | |
1062 | UnicodeSetIterator iter(set); | |
1063 | while(iter.next() && !iter.isString()) { | |
1064 | UChar32 c = iter.getCodepoint(); | |
1065 | uint32_t ce32 = utrie2_get32(trie, c); | |
1066 | if(ce32 == Collation::FALLBACK_CE32) { | |
1067 | ce32 = base->getFinalCE32(base->getCE32(c)); | |
1068 | if(Collation::ce32HasContext(ce32)) { | |
1069 | ce32 = copyFromBaseCE32(c, ce32, FALSE /* without context */, errorCode); | |
1070 | utrie2_set32(trie, c, ce32, &errorCode); | |
1071 | } | |
1072 | } else if(isBuilderContextCE32(ce32)) { | |
1073 | ce32 = getConditionalCE32ForCE32(ce32)->ce32; | |
1074 | // Simply abandon the list of ConditionalCE32. | |
1075 | // The caller will copy this builder in the end, | |
1076 | // eliminating unreachable data. | |
1077 | utrie2_set32(trie, c, ce32, &errorCode); | |
1078 | contextChars.remove(c); | |
1079 | } | |
1080 | } | |
1081 | modified = TRUE; | |
1082 | } | |
1083 | ||
1084 | UBool | |
1085 | CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) { | |
1086 | if(U_FAILURE(errorCode)) { return FALSE; } | |
1087 | UBool anyJamoAssigned = base == NULL; // always set jamoCE32s in the base data | |
1088 | UBool needToCopyFromBase = FALSE; | |
1089 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. | |
1090 | UChar32 jamo = jamoCpFromIndex(j); | |
1091 | UBool fromBase = FALSE; | |
1092 | uint32_t ce32 = utrie2_get32(trie, jamo); | |
1093 | anyJamoAssigned |= Collation::isAssignedCE32(ce32); | |
1094 | // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. | |
1095 | // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) | |
1096 | if(ce32 == Collation::FALLBACK_CE32) { | |
1097 | fromBase = TRUE; | |
1098 | ce32 = base->getCE32(jamo); | |
1099 | } | |
1100 | if(Collation::isSpecialCE32(ce32)) { | |
1101 | switch(Collation::tagFromCE32(ce32)) { | |
1102 | case Collation::LONG_PRIMARY_TAG: | |
1103 | case Collation::LONG_SECONDARY_TAG: | |
1104 | case Collation::LATIN_EXPANSION_TAG: | |
1105 | // Copy the ce32 as-is. | |
1106 | break; | |
1107 | case Collation::EXPANSION32_TAG: | |
1108 | case Collation::EXPANSION_TAG: | |
1109 | case Collation::PREFIX_TAG: | |
1110 | case Collation::CONTRACTION_TAG: | |
1111 | if(fromBase) { | |
1112 | // Defer copying until we know if anyJamoAssigned. | |
1113 | ce32 = Collation::FALLBACK_CE32; | |
1114 | needToCopyFromBase = TRUE; | |
1115 | } | |
1116 | break; | |
1117 | case Collation::IMPLICIT_TAG: | |
1118 | // An unassigned Jamo should only occur in tests with incomplete bases. | |
1119 | U_ASSERT(fromBase); | |
1120 | ce32 = Collation::FALLBACK_CE32; | |
1121 | needToCopyFromBase = TRUE; | |
1122 | break; | |
1123 | case Collation::OFFSET_TAG: | |
1124 | ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32); | |
1125 | break; | |
1126 | case Collation::FALLBACK_TAG: | |
1127 | case Collation::RESERVED_TAG_3: | |
1128 | case Collation::BUILDER_DATA_TAG: | |
1129 | case Collation::DIGIT_TAG: | |
1130 | case Collation::U0000_TAG: | |
1131 | case Collation::HANGUL_TAG: | |
1132 | case Collation::LEAD_SURROGATE_TAG: | |
1133 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
1134 | return FALSE; | |
1135 | } | |
1136 | } | |
1137 | jamoCE32s[j] = ce32; | |
1138 | } | |
1139 | if(anyJamoAssigned && needToCopyFromBase) { | |
1140 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { | |
1141 | if(jamoCE32s[j] == Collation::FALLBACK_CE32) { | |
1142 | UChar32 jamo = jamoCpFromIndex(j); | |
1143 | jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo), | |
1144 | /*withContext=*/ TRUE, errorCode); | |
1145 | } | |
1146 | } | |
1147 | } | |
1148 | return anyJamoAssigned && U_SUCCESS(errorCode); | |
1149 | } | |
1150 | ||
1151 | void | |
1152 | CollationDataBuilder::setDigitTags(UErrorCode &errorCode) { | |
1153 | UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode); | |
1154 | if(U_FAILURE(errorCode)) { return; } | |
1155 | UnicodeSetIterator iter(digits); | |
1156 | while(iter.next()) { | |
1157 | U_ASSERT(!iter.isString()); | |
1158 | UChar32 c = iter.getCodepoint(); | |
1159 | uint32_t ce32 = utrie2_get32(trie, c); | |
1160 | if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) { | |
1161 | int32_t index = addCE32(ce32, errorCode); | |
1162 | if(U_FAILURE(errorCode)) { return; } | |
1163 | if(index > Collation::MAX_INDEX) { | |
1164 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1165 | return; | |
1166 | } | |
1167 | ce32 = Collation::makeCE32FromTagIndexAndLength( | |
1168 | Collation::DIGIT_TAG, index, u_charDigitValue(c)); | |
1169 | utrie2_set32(trie, c, ce32, &errorCode); | |
1170 | } | |
1171 | } | |
1172 | } | |
1173 | ||
1174 | U_CDECL_BEGIN | |
1175 | ||
1176 | static UBool U_CALLCONV | |
1177 | enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { | |
1178 | int32_t *pValue = (int32_t *)context; | |
1179 | if(value == Collation::UNASSIGNED_CE32) { | |
1180 | value = Collation::LEAD_ALL_UNASSIGNED; | |
1181 | } else if(value == Collation::FALLBACK_CE32) { | |
1182 | value = Collation::LEAD_ALL_FALLBACK; | |
1183 | } else { | |
1184 | *pValue = Collation::LEAD_MIXED; | |
1185 | return FALSE; | |
1186 | } | |
1187 | if(*pValue < 0) { | |
1188 | *pValue = (int32_t)value; | |
1189 | } else if(*pValue != (int32_t)value) { | |
1190 | *pValue = Collation::LEAD_MIXED; | |
1191 | return FALSE; | |
1192 | } | |
1193 | return TRUE; | |
1194 | } | |
1195 | ||
1196 | U_CDECL_END | |
1197 | ||
1198 | void | |
1199 | CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) { | |
1200 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead) { | |
1201 | int32_t value = -1; | |
1202 | utrie2_enumForLeadSurrogate(trie, lead, NULL, enumRangeLeadValue, &value); | |
1203 | utrie2_set32ForLeadSurrogateCodeUnit( | |
1204 | trie, lead, | |
1205 | Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value, | |
1206 | &errorCode); | |
1207 | } | |
1208 | } | |
1209 | ||
1210 | void | |
1211 | CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) { | |
1212 | buildMappings(data, errorCode); | |
1213 | if(base != NULL) { | |
1214 | data.numericPrimary = base->numericPrimary; | |
1215 | data.compressibleBytes = base->compressibleBytes; | |
b331163b A |
1216 | data.numScripts = base->numScripts; |
1217 | data.scriptsIndex = base->scriptsIndex; | |
1218 | data.scriptStarts = base->scriptStarts; | |
1219 | data.scriptStartsLength = base->scriptStartsLength; | |
57a6839d A |
1220 | } |
1221 | buildFastLatinTable(data, errorCode); | |
1222 | } | |
1223 | ||
1224 | void | |
1225 | CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) { | |
1226 | if(U_FAILURE(errorCode)) { return; } | |
1227 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
1228 | errorCode = U_INVALID_STATE_ERROR; | |
1229 | return; | |
1230 | } | |
1231 | ||
1232 | buildContexts(errorCode); | |
1233 | ||
1234 | uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; | |
1235 | int32_t jamoIndex = -1; | |
1236 | if(getJamoCE32s(jamoCE32s, errorCode)) { | |
1237 | jamoIndex = ce32s.size(); | |
1238 | for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) { | |
1239 | ce32s.addElement((int32_t)jamoCE32s[i], errorCode); | |
1240 | } | |
1241 | // Small optimization: Use a bit in the Hangul ce32 | |
1242 | // to indicate that none of the Jamo CE32s are isSpecialCE32() | |
1243 | // (as it should be in the root collator). | |
1244 | // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. | |
1245 | // In order to still have good trie compression and keep this code simple, | |
1246 | // we only set this flag if a whole block of 588 Hangul syllables starting with | |
1247 | // a common leading consonant (Jamo L) has this property. | |
1248 | UBool isAnyJamoVTSpecial = FALSE; | |
1249 | for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) { | |
1250 | if(Collation::isSpecialCE32(jamoCE32s[i])) { | |
1251 | isAnyJamoVTSpecial = TRUE; | |
1252 | break; | |
1253 | } | |
1254 | } | |
1255 | uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); | |
1256 | UChar32 c = Hangul::HANGUL_BASE; | |
1257 | for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L | |
1258 | uint32_t ce32 = hangulCE32; | |
1259 | if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) { | |
1260 | ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO; | |
1261 | } | |
1262 | UChar32 limit = c + Hangul::JAMO_VT_COUNT; | |
1263 | utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); | |
1264 | c = limit; | |
1265 | } | |
1266 | } else { | |
1267 | // Copy the Hangul CE32s from the base in blocks per Jamo L, | |
1268 | // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. | |
1269 | for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) { | |
1270 | uint32_t ce32 = base->getCE32(c); | |
1271 | U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG)); | |
1272 | UChar32 limit = c + Hangul::JAMO_VT_COUNT; | |
1273 | utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); | |
1274 | c = limit; | |
1275 | } | |
1276 | } | |
1277 | ||
1278 | setDigitTags(errorCode); | |
1279 | setLeadSurrogates(errorCode); | |
1280 | ||
1281 | // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. | |
1282 | ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0); | |
1283 | utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode); | |
1284 | ||
1285 | utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode); | |
1286 | if(U_FAILURE(errorCode)) { return; } | |
1287 | ||
1288 | // Mark each lead surrogate as "unsafe" | |
1289 | // if any of its 1024 associated supplementary code points is "unsafe". | |
1290 | UChar32 c = 0x10000; | |
1291 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { | |
1292 | if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { | |
1293 | unsafeBackwardSet.add(lead); | |
1294 | } | |
1295 | } | |
1296 | unsafeBackwardSet.freeze(); | |
1297 | ||
1298 | data.trie = trie; | |
1299 | data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer()); | |
1300 | data.ces = ce64s.getBuffer(); | |
1301 | data.contexts = contexts.getBuffer(); | |
1302 | ||
1303 | data.ce32sLength = ce32s.size(); | |
1304 | data.cesLength = ce64s.size(); | |
1305 | data.contextsLength = contexts.length(); | |
1306 | ||
1307 | data.base = base; | |
1308 | if(jamoIndex >= 0) { | |
1309 | data.jamoCE32s = data.ce32s + jamoIndex; | |
1310 | } else { | |
1311 | data.jamoCE32s = base->jamoCE32s; | |
1312 | } | |
1313 | data.unsafeBackwardSet = &unsafeBackwardSet; | |
1314 | } | |
1315 | ||
1316 | void | |
1317 | CollationDataBuilder::clearContexts() { | |
1318 | contexts.remove(); | |
1319 | UnicodeSetIterator iter(contextChars); | |
1320 | while(iter.next()) { | |
1321 | U_ASSERT(!iter.isString()); | |
1322 | uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint()); | |
1323 | U_ASSERT(isBuilderContextCE32(ce32)); | |
1324 | getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32; | |
1325 | } | |
1326 | } | |
1327 | ||
1328 | void | |
1329 | CollationDataBuilder::buildContexts(UErrorCode &errorCode) { | |
1330 | if(U_FAILURE(errorCode)) { return; } | |
1331 | // Ignore abandoned lists and the cached builtCE32, | |
1332 | // and build all contexts from scratch. | |
1333 | contexts.remove(); | |
1334 | UnicodeSetIterator iter(contextChars); | |
1335 | while(U_SUCCESS(errorCode) && iter.next()) { | |
1336 | U_ASSERT(!iter.isString()); | |
1337 | UChar32 c = iter.getCodepoint(); | |
1338 | uint32_t ce32 = utrie2_get32(trie, c); | |
1339 | if(!isBuilderContextCE32(ce32)) { | |
1340 | // Impossible: No context data for c in contextChars. | |
1341 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
1342 | return; | |
1343 | } | |
1344 | ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32); | |
1345 | ce32 = buildContext(cond, errorCode); | |
1346 | utrie2_set32(trie, c, ce32, &errorCode); | |
1347 | } | |
1348 | } | |
1349 | ||
1350 | uint32_t | |
1351 | CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) { | |
1352 | if(U_FAILURE(errorCode)) { return 0; } | |
1353 | // The list head must have no context. | |
1354 | U_ASSERT(!head->hasContext()); | |
1355 | // The list head must be followed by one or more nodes that all do have context. | |
1356 | U_ASSERT(head->next >= 0); | |
1357 | UCharsTrieBuilder prefixBuilder(errorCode); | |
1358 | UCharsTrieBuilder contractionBuilder(errorCode); | |
1359 | for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) { | |
1360 | // After the list head, the prefix or suffix can be empty, but not both. | |
1361 | U_ASSERT(cond == head || cond->hasContext()); | |
1362 | int32_t prefixLength = cond->prefixLength(); | |
1363 | UnicodeString prefix(cond->context, 0, prefixLength + 1); | |
1364 | // Collect all contraction suffixes for one prefix. | |
1365 | ConditionalCE32 *firstCond = cond; | |
1366 | ConditionalCE32 *lastCond = cond; | |
1367 | while(cond->next >= 0 && | |
1368 | (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) { | |
1369 | lastCond = cond; | |
1370 | } | |
1371 | uint32_t ce32; | |
1372 | int32_t suffixStart = prefixLength + 1; // == prefix.length() | |
1373 | if(lastCond->context.length() == suffixStart) { | |
1374 | // One prefix without contraction suffix. | |
1375 | U_ASSERT(firstCond == lastCond); | |
1376 | ce32 = lastCond->ce32; | |
1377 | cond = lastCond; | |
1378 | } else { | |
1379 | // Build the contractions trie. | |
1380 | contractionBuilder.clear(); | |
1381 | // Entry for an empty suffix, to be stored before the trie. | |
b331163b | 1382 | uint32_t emptySuffixCE32 = 0; |
57a6839d A |
1383 | uint32_t flags = 0; |
1384 | if(firstCond->context.length() == suffixStart) { | |
1385 | // There is a mapping for the prefix and the single character c. (p|c) | |
1386 | // If no other suffix matches, then we return this value. | |
1387 | emptySuffixCE32 = firstCond->ce32; | |
1388 | cond = getConditionalCE32(firstCond->next); | |
1389 | } else { | |
1390 | // There is no mapping for the prefix and just the single character. | |
1391 | // (There is no p|c, only p|cd, p|ce etc.) | |
1392 | flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH; | |
1393 | // When the prefix matches but none of the prefix-specific suffixes, | |
1394 | // then we fall back to the mappings with the next-longest prefix, | |
1395 | // and ultimately to mappings with no prefix. | |
1396 | // Each fallback might be another set of contractions. | |
1397 | // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, | |
1398 | // then in text "pch" we find the ch contraction. | |
1399 | for(cond = head;; cond = getConditionalCE32(cond->next)) { | |
1400 | int32_t length = cond->prefixLength(); | |
1401 | if(length == prefixLength) { break; } | |
1402 | if(cond->defaultCE32 != Collation::NO_CE32 && | |
1403 | (length==0 || prefix.endsWith(cond->context, 1, length))) { | |
1404 | emptySuffixCE32 = cond->defaultCE32; | |
1405 | } | |
1406 | } | |
1407 | cond = firstCond; | |
1408 | } | |
1409 | // Optimization: Set a flag when | |
1410 | // the first character of every contraction suffix has lccc!=0. | |
1411 | // Short-circuits contraction matching when a normal letter follows. | |
1412 | flags |= Collation::CONTRACT_NEXT_CCC; | |
1413 | // Add all of the non-empty suffixes into the contraction trie. | |
1414 | for(;;) { | |
1415 | UnicodeString suffix(cond->context, suffixStart); | |
1416 | uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0)); | |
1417 | if(fcd16 <= 0xff) { | |
1418 | flags &= ~Collation::CONTRACT_NEXT_CCC; | |
1419 | } | |
1420 | fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1)); | |
1421 | if(fcd16 > 0xff) { | |
1422 | // The last suffix character has lccc!=0, allowing for discontiguous contractions. | |
1423 | flags |= Collation::CONTRACT_TRAILING_CCC; | |
1424 | } | |
1425 | contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode); | |
1426 | if(cond == lastCond) { break; } | |
1427 | cond = getConditionalCE32(cond->next); | |
1428 | } | |
1429 | int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode); | |
1430 | if(U_FAILURE(errorCode)) { return 0; } | |
1431 | if(index > Collation::MAX_INDEX) { | |
1432 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1433 | return 0; | |
1434 | } | |
1435 | ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags; | |
1436 | } | |
1437 | U_ASSERT(cond == lastCond); | |
1438 | firstCond->defaultCE32 = ce32; | |
1439 | if(prefixLength == 0) { | |
1440 | if(cond->next < 0) { | |
1441 | // No non-empty prefixes, only contractions. | |
1442 | return ce32; | |
1443 | } | |
1444 | } else { | |
1445 | prefix.remove(0, 1); // Remove the length unit. | |
1446 | prefix.reverse(); | |
1447 | prefixBuilder.add(prefix, (int32_t)ce32, errorCode); | |
1448 | if(cond->next < 0) { break; } | |
1449 | } | |
1450 | } | |
1451 | U_ASSERT(head->defaultCE32 != Collation::NO_CE32); | |
1452 | int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode); | |
1453 | if(U_FAILURE(errorCode)) { return 0; } | |
1454 | if(index > Collation::MAX_INDEX) { | |
1455 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1456 | return 0; | |
1457 | } | |
1458 | return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index); | |
1459 | } | |
1460 | ||
1461 | int32_t | |
1462 | CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, | |
1463 | UErrorCode &errorCode) { | |
1464 | UnicodeString context; | |
1465 | context.append((UChar)(defaultCE32 >> 16)).append((UChar)defaultCE32); | |
1466 | UnicodeString trieString; | |
1467 | context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode)); | |
1468 | if(U_FAILURE(errorCode)) { return -1; } | |
1469 | int32_t index = contexts.indexOf(context); | |
1470 | if(index < 0) { | |
1471 | index = contexts.length(); | |
1472 | contexts.append(context); | |
1473 | } | |
1474 | return index; | |
1475 | } | |
1476 | ||
1477 | void | |
1478 | CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) { | |
1479 | if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; } | |
1480 | ||
1481 | delete fastLatinBuilder; | |
1482 | fastLatinBuilder = new CollationFastLatinBuilder(errorCode); | |
1483 | if(fastLatinBuilder == NULL) { | |
1484 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
1485 | return; | |
1486 | } | |
1487 | if(fastLatinBuilder->forData(data, errorCode)) { | |
1488 | const uint16_t *table = fastLatinBuilder->getTable(); | |
1489 | int32_t length = fastLatinBuilder->lengthOfTable(); | |
1490 | if(base != NULL && length == base->fastLatinTableLength && | |
1491 | uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) { | |
1492 | // Same fast Latin table as in the base, use that one instead. | |
1493 | delete fastLatinBuilder; | |
1494 | fastLatinBuilder = NULL; | |
1495 | table = base->fastLatinTable; | |
1496 | } | |
1497 | data.fastLatinTable = table; | |
1498 | data.fastLatinTableLength = length; | |
1499 | } else { | |
1500 | delete fastLatinBuilder; | |
1501 | fastLatinBuilder = NULL; | |
1502 | } | |
1503 | } | |
1504 | ||
1505 | int32_t | |
1506 | CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) { | |
1507 | return getCEs(s, 0, ces, cesLength); | |
1508 | } | |
1509 | ||
1510 | int32_t | |
1511 | CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s, | |
1512 | int64_t ces[], int32_t cesLength) { | |
1513 | int32_t prefixLength = prefix.length(); | |
1514 | if(prefixLength == 0) { | |
1515 | return getCEs(s, 0, ces, cesLength); | |
1516 | } else { | |
1517 | return getCEs(prefix + s, prefixLength, ces, cesLength); | |
1518 | } | |
1519 | } | |
1520 | ||
1521 | int32_t | |
1522 | CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start, | |
1523 | int64_t ces[], int32_t cesLength) { | |
1524 | if(collIter == NULL) { | |
1525 | collIter = new DataBuilderCollationIterator(*this); | |
1526 | if(collIter == NULL) { return 0; } | |
1527 | } | |
1528 | return collIter->fetchCEs(s, start, ces, cesLength); | |
1529 | } | |
1530 | ||
1531 | U_NAMESPACE_END | |
1532 | ||
1533 | #endif // !UCONFIG_NO_COLLATION |