]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
57a6839d A |
3 | /* |
4 | ******************************************************************************* | |
b331163b | 5 | * Copyright (C) 2012-2015, International Business Machines |
57a6839d A |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* | |
8 | * collationdatabuilder.cpp | |
9 | * | |
10 | * (replaced the former ucol_elm.cpp) | |
11 | * | |
12 | * created on: 2012apr01 | |
13 | * created by: Markus W. Scherer | |
14 | */ | |
15 | ||
16 | #include "unicode/utypes.h" | |
17 | ||
18 | #if !UCONFIG_NO_COLLATION | |
19 | ||
20 | #include "unicode/localpointer.h" | |
21 | #include "unicode/uchar.h" | |
22 | #include "unicode/ucharstrie.h" | |
23 | #include "unicode/ucharstriebuilder.h" | |
24 | #include "unicode/uniset.h" | |
25 | #include "unicode/unistr.h" | |
26 | #include "unicode/usetiter.h" | |
27 | #include "unicode/utf16.h" | |
28 | #include "cmemory.h" | |
29 | #include "collation.h" | |
30 | #include "collationdata.h" | |
31 | #include "collationdatabuilder.h" | |
32 | #include "collationfastlatinbuilder.h" | |
33 | #include "collationiterator.h" | |
34 | #include "normalizer2impl.h" | |
35 | #include "utrie2.h" | |
36 | #include "uvectr32.h" | |
37 | #include "uvectr64.h" | |
38 | #include "uvector.h" | |
39 | ||
57a6839d A |
40 | U_NAMESPACE_BEGIN |
41 | ||
42 | CollationDataBuilder::CEModifier::~CEModifier() {} | |
43 | ||
44 | /** | |
45 | * Build-time context and CE32 for a code point. | |
46 | * If a code point has contextual mappings, then the default (no-context) mapping | |
47 | * and all conditional mappings are stored in a singly-linked list | |
48 | * of ConditionalCE32, sorted by context strings. | |
49 | * | |
50 | * Context strings sort by prefix length, then by prefix, then by contraction suffix. | |
51 | * Context strings must be unique and in ascending order. | |
52 | */ | |
53 | struct ConditionalCE32 : public UMemory { | |
b331163b A |
54 | ConditionalCE32() |
55 | : context(), | |
56 | ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), | |
57 | next(-1) {} | |
57a6839d A |
58 | ConditionalCE32(const UnicodeString &ct, uint32_t ce) |
59 | : context(ct), | |
60 | ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), | |
61 | next(-1) {} | |
62 | ||
63 | inline UBool hasContext() const { return context.length() > 1; } | |
64 | inline int32_t prefixLength() const { return context.charAt(0); } | |
65 | ||
66 | /** | |
67 | * "\0" for the first entry for any code point, with its default CE32. | |
68 | * | |
69 | * Otherwise one unit with the length of the prefix string, | |
70 | * then the prefix string, then the contraction suffix. | |
71 | */ | |
72 | UnicodeString context; | |
73 | /** | |
74 | * CE32 for the code point and its context. | |
75 | * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). | |
76 | */ | |
77 | uint32_t ce32; | |
78 | /** | |
79 | * Default CE32 for all contexts with this same prefix. | |
80 | * Initially NO_CE32. Set only while building runtime data structures, | |
81 | * and only on one of the nodes of a sub-list with the same prefix. | |
82 | */ | |
83 | uint32_t defaultCE32; | |
84 | /** | |
85 | * CE32 for the built contexts. | |
86 | * When fetching CEs from the builder, the contexts are built into their runtime form | |
87 | * so that the normal collation implementation can process them. | |
88 | * The result is cached in the list head. It is reset when the contexts are modified. | |
89 | */ | |
90 | uint32_t builtCE32; | |
91 | /** | |
92 | * Index of the next ConditionalCE32. | |
93 | * Negative for the end of the list. | |
94 | */ | |
95 | int32_t next; | |
96 | }; | |
97 | ||
98 | U_CDECL_BEGIN | |
99 | ||
100 | U_CAPI void U_CALLCONV | |
101 | uprv_deleteConditionalCE32(void *obj) { | |
102 | delete static_cast<ConditionalCE32 *>(obj); | |
103 | } | |
104 | ||
105 | U_CDECL_END | |
106 | ||
107 | /** | |
108 | * Build-time collation element and character iterator. | |
109 | * Uses the runtime CollationIterator for fetching CEs for a string | |
110 | * but reads from the builder's unfinished data structures. | |
111 | * In particular, this class reads from the unfinished trie | |
112 | * and has to avoid CollationIterator::nextCE() and redirect other | |
113 | * calls to data->getCE32() and data->getCE32FromSupplementary(). | |
114 | * | |
115 | * We do this so that we need not implement the collation algorithm | |
116 | * again for the builder and make it behave exactly like the runtime code. | |
117 | * That would be more difficult to test and maintain than this indirection. | |
118 | * | |
119 | * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, | |
120 | * so the data accesses from those code paths need not be modified. | |
121 | * | |
122 | * This class iterates directly over whole code points | |
123 | * so that the CollationIterator does not need the finished trie | |
124 | * for handling the LEAD_SURROGATE_TAG. | |
125 | */ | |
126 | class DataBuilderCollationIterator : public CollationIterator { | |
127 | public: | |
128 | DataBuilderCollationIterator(CollationDataBuilder &b); | |
129 | ||
130 | virtual ~DataBuilderCollationIterator(); | |
131 | ||
132 | int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength); | |
133 | ||
134 | virtual void resetToOffset(int32_t newOffset); | |
135 | virtual int32_t getOffset() const; | |
136 | ||
137 | virtual UChar32 nextCodePoint(UErrorCode &errorCode); | |
138 | virtual UChar32 previousCodePoint(UErrorCode &errorCode); | |
139 | ||
140 | protected: | |
141 | virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode); | |
142 | virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode); | |
143 | ||
144 | virtual uint32_t getDataCE32(UChar32 c) const; | |
145 | virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode); | |
146 | ||
147 | CollationDataBuilder &builder; | |
148 | CollationData builderData; | |
149 | uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; | |
150 | const UnicodeString *s; | |
151 | int32_t pos; | |
152 | }; | |
153 | ||
154 | DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b) | |
155 | : CollationIterator(&builderData, /*numeric=*/ FALSE), | |
156 | builder(b), builderData(b.nfcImpl), | |
157 | s(NULL), pos(0) { | |
158 | builderData.base = builder.base; | |
159 | // Set all of the jamoCE32s[] to indirection CE32s. | |
160 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. | |
161 | UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j); | |
162 | jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) | | |
163 | CollationDataBuilder::IS_BUILDER_JAMO_CE32; | |
164 | } | |
165 | builderData.jamoCE32s = jamoCE32s; | |
166 | } | |
167 | ||
168 | DataBuilderCollationIterator::~DataBuilderCollationIterator() {} | |
169 | ||
170 | int32_t | |
171 | DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start, | |
172 | int64_t ces[], int32_t cesLength) { | |
173 | // Set the pointers each time, in case they changed due to reallocation. | |
174 | builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer()); | |
175 | builderData.ces = builder.ce64s.getBuffer(); | |
176 | builderData.contexts = builder.contexts.getBuffer(); | |
177 | // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32(). | |
178 | reset(); | |
179 | s = &str; | |
180 | pos = start; | |
181 | UErrorCode errorCode = U_ZERO_ERROR; | |
182 | while(U_SUCCESS(errorCode) && pos < s->length()) { | |
183 | // No need to keep all CEs in the iterator buffer. | |
184 | clearCEs(); | |
185 | UChar32 c = s->char32At(pos); | |
186 | pos += U16_LENGTH(c); | |
187 | uint32_t ce32 = utrie2_get32(builder.trie, c); | |
188 | const CollationData *d; | |
189 | if(ce32 == Collation::FALLBACK_CE32) { | |
190 | d = builder.base; | |
191 | ce32 = builder.base->getCE32(c); | |
192 | } else { | |
193 | d = &builderData; | |
194 | } | |
195 | appendCEsFromCE32(d, c, ce32, /*forward=*/ TRUE, errorCode); | |
196 | U_ASSERT(U_SUCCESS(errorCode)); | |
197 | for(int32_t i = 0; i < getCEsLength(); ++i) { | |
198 | int64_t ce = getCE(i); | |
199 | if(ce != 0) { | |
200 | if(cesLength < Collation::MAX_EXPANSION_LENGTH) { | |
201 | ces[cesLength] = ce; | |
202 | } | |
203 | ++cesLength; | |
204 | } | |
205 | } | |
206 | } | |
207 | return cesLength; | |
208 | } | |
209 | ||
210 | void | |
211 | DataBuilderCollationIterator::resetToOffset(int32_t newOffset) { | |
212 | reset(); | |
213 | pos = newOffset; | |
214 | } | |
215 | ||
216 | int32_t | |
217 | DataBuilderCollationIterator::getOffset() const { | |
218 | return pos; | |
219 | } | |
220 | ||
221 | UChar32 | |
222 | DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { | |
223 | if(pos == s->length()) { | |
224 | return U_SENTINEL; | |
225 | } | |
226 | UChar32 c = s->char32At(pos); | |
227 | pos += U16_LENGTH(c); | |
228 | return c; | |
229 | } | |
230 | ||
231 | UChar32 | |
232 | DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { | |
233 | if(pos == 0) { | |
234 | return U_SENTINEL; | |
235 | } | |
236 | UChar32 c = s->char32At(pos - 1); | |
237 | pos -= U16_LENGTH(c); | |
238 | return c; | |
239 | } | |
240 | ||
241 | void | |
242 | DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { | |
243 | pos = s->moveIndex32(pos, num); | |
244 | } | |
245 | ||
246 | void | |
247 | DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { | |
248 | pos = s->moveIndex32(pos, -num); | |
249 | } | |
250 | ||
251 | uint32_t | |
252 | DataBuilderCollationIterator::getDataCE32(UChar32 c) const { | |
253 | return utrie2_get32(builder.trie, c); | |
254 | } | |
255 | ||
256 | uint32_t | |
257 | DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) { | |
258 | U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)); | |
259 | if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) { | |
260 | UChar32 jamo = Collation::indexFromCE32(ce32); | |
261 | return utrie2_get32(builder.trie, jamo); | |
262 | } else { | |
263 | ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32); | |
264 | if(cond->builtCE32 == Collation::NO_CE32) { | |
265 | // Build the context-sensitive mappings into their runtime form and cache the result. | |
266 | cond->builtCE32 = builder.buildContext(cond, errorCode); | |
267 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { | |
268 | errorCode = U_ZERO_ERROR; | |
269 | builder.clearContexts(); | |
270 | cond->builtCE32 = builder.buildContext(cond, errorCode); | |
271 | } | |
272 | builderData.contexts = builder.contexts.getBuffer(); | |
273 | } | |
274 | return cond->builtCE32; | |
275 | } | |
276 | } | |
277 | ||
278 | // ------------------------------------------------------------------------- *** | |
279 | ||
280 | CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode) | |
281 | : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), | |
282 | base(NULL), baseSettings(NULL), | |
283 | trie(NULL), | |
284 | ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode), | |
285 | modified(FALSE), | |
286 | fastLatinEnabled(FALSE), fastLatinBuilder(NULL), | |
287 | collIter(NULL) { | |
288 | // Reserve the first CE32 for U+0000. | |
289 | ce32s.addElement(0, errorCode); | |
290 | conditionalCE32s.setDeleter(uprv_deleteConditionalCE32); | |
291 | } | |
292 | ||
293 | CollationDataBuilder::~CollationDataBuilder() { | |
294 | utrie2_close(trie); | |
295 | delete fastLatinBuilder; | |
296 | delete collIter; | |
297 | } | |
298 | ||
299 | void | |
300 | CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) { | |
301 | if(U_FAILURE(errorCode)) { return; } | |
302 | if(trie != NULL) { | |
303 | errorCode = U_INVALID_STATE_ERROR; | |
304 | return; | |
305 | } | |
306 | if(b == NULL) { | |
307 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
308 | return; | |
309 | } | |
310 | base = b; | |
311 | ||
312 | // For a tailoring, the default is to fall back to the base. | |
313 | trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode); | |
314 | ||
315 | // Set the Latin-1 letters block so that it is allocated first in the data array, | |
316 | // to try to improve locality of reference when sorting Latin-1 text. | |
317 | // Do not use utrie2_setRange32() since that will not actually allocate blocks | |
318 | // that are filled with the default value. | |
319 | // ASCII (0..7F) is already preallocated anyway. | |
320 | for(UChar32 c = 0xc0; c <= 0xff; ++c) { | |
321 | utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode); | |
322 | } | |
323 | ||
324 | // Hangul syllables are not tailorable (except via tailoring Jamos). | |
325 | // Always set the Hangul tag to help performance. | |
326 | // Do this here, rather than in buildMappings(), | |
327 | // so that we see the HANGUL_TAG in various assertions. | |
328 | uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); | |
329 | utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode); | |
330 | ||
331 | // Copy the set contents but don't copy/clone the set as a whole because | |
332 | // that would copy the isFrozen state too. | |
333 | unsafeBackwardSet.addAll(*b->unsafeBackwardSet); | |
334 | ||
335 | if(U_FAILURE(errorCode)) { return; } | |
336 | } | |
337 | ||
338 | UBool | |
339 | CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end, | |
340 | uint32_t primary, int32_t step, | |
341 | UErrorCode &errorCode) { | |
342 | if(U_FAILURE(errorCode)) { return FALSE; } | |
343 | U_ASSERT(start <= end); | |
344 | // TODO: Do we need to check what values are currently set for start..end? | |
345 | // An offset range is worth it only if we can achieve an overlap between | |
346 | // adjacent UTrie2 blocks of 32 code points each. | |
347 | // An offset CE is also a little more expensive to look up and compute | |
348 | // than a simple CE. | |
349 | // If the range spans at least three UTrie2 block boundaries (> 64 code points), | |
350 | // then we take it. | |
351 | // If the range spans one or two block boundaries and there are | |
352 | // at least 4 code points on either side, then we take it. | |
353 | // (We could additionally require a minimum range length of, say, 16.) | |
354 | int32_t blockDelta = (end >> 5) - (start >> 5); | |
355 | if(2 <= step && step <= 0x7f && | |
356 | (blockDelta >= 3 || | |
357 | (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) { | |
358 | int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step; | |
359 | if(isCompressiblePrimary(primary)) { dataCE |= 0x80; } | |
360 | int32_t index = addCE(dataCE, errorCode); | |
361 | if(U_FAILURE(errorCode)) { return 0; } | |
362 | if(index > Collation::MAX_INDEX) { | |
363 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
364 | return 0; | |
365 | } | |
366 | uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index); | |
367 | utrie2_setRange32(trie, start, end, offsetCE32, TRUE, &errorCode); | |
368 | modified = TRUE; | |
369 | return TRUE; | |
370 | } else { | |
371 | return FALSE; | |
372 | } | |
373 | } | |
374 | ||
375 | uint32_t | |
376 | CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, | |
377 | uint32_t primary, int32_t step, | |
378 | UErrorCode &errorCode) { | |
379 | if(U_FAILURE(errorCode)) { return 0; } | |
380 | UBool isCompressible = isCompressiblePrimary(primary); | |
381 | if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) { | |
382 | return Collation::incThreeBytePrimaryByOffset(primary, isCompressible, | |
383 | (end - start + 1) * step); | |
384 | } else { | |
385 | // Short range: Set individual CE32s. | |
386 | for(;;) { | |
387 | utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode); | |
388 | ++start; | |
389 | primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step); | |
390 | if(start > end) { return primary; } | |
391 | } | |
392 | modified = TRUE; | |
393 | } | |
394 | } | |
395 | ||
396 | uint32_t | |
397 | CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const { | |
398 | int32_t i = Collation::indexFromCE32(ce32); | |
399 | int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i); | |
400 | uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); | |
401 | return Collation::makeLongPrimaryCE32(p); | |
402 | } | |
403 | ||
404 | UBool | |
405 | CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const { | |
406 | return base->isCompressibleLeadByte(b); | |
407 | } | |
408 | ||
409 | UBool | |
410 | CollationDataBuilder::isAssigned(UChar32 c) const { | |
411 | return Collation::isAssignedCE32(utrie2_get32(trie, c)); | |
412 | } | |
413 | ||
414 | uint32_t | |
415 | CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const { | |
416 | uint32_t ce32 = utrie2_get32(trie, c); | |
417 | if(Collation::isLongPrimaryCE32(ce32)) { | |
418 | return Collation::primaryFromLongPrimaryCE32(ce32); | |
419 | } else { | |
420 | return 0; | |
421 | } | |
422 | } | |
423 | ||
424 | int64_t | |
425 | CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const { | |
426 | if(U_FAILURE(errorCode)) { return 0; } | |
b331163b | 427 | // Keep parallel with CollationData::getSingleCE(). |
57a6839d A |
428 | UBool fromBase = FALSE; |
429 | uint32_t ce32 = utrie2_get32(trie, c); | |
430 | if(ce32 == Collation::FALLBACK_CE32) { | |
431 | fromBase = TRUE; | |
432 | ce32 = base->getCE32(c); | |
433 | } | |
434 | while(Collation::isSpecialCE32(ce32)) { | |
435 | switch(Collation::tagFromCE32(ce32)) { | |
436 | case Collation::LATIN_EXPANSION_TAG: | |
437 | case Collation::BUILDER_DATA_TAG: | |
438 | case Collation::PREFIX_TAG: | |
439 | case Collation::CONTRACTION_TAG: | |
440 | case Collation::HANGUL_TAG: | |
441 | case Collation::LEAD_SURROGATE_TAG: | |
442 | errorCode = U_UNSUPPORTED_ERROR; | |
443 | return 0; | |
444 | case Collation::FALLBACK_TAG: | |
445 | case Collation::RESERVED_TAG_3: | |
446 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
447 | return 0; | |
448 | case Collation::LONG_PRIMARY_TAG: | |
449 | return Collation::ceFromLongPrimaryCE32(ce32); | |
450 | case Collation::LONG_SECONDARY_TAG: | |
451 | return Collation::ceFromLongSecondaryCE32(ce32); | |
452 | case Collation::EXPANSION32_TAG: | |
453 | if(Collation::lengthFromCE32(ce32) == 1) { | |
454 | int32_t i = Collation::indexFromCE32(ce32); | |
455 | ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i); | |
456 | break; | |
457 | } else { | |
458 | errorCode = U_UNSUPPORTED_ERROR; | |
459 | return 0; | |
460 | } | |
461 | case Collation::EXPANSION_TAG: { | |
462 | if(Collation::lengthFromCE32(ce32) == 1) { | |
463 | int32_t i = Collation::indexFromCE32(ce32); | |
464 | return fromBase ? base->ces[i] : ce64s.elementAti(i); | |
465 | } else { | |
466 | errorCode = U_UNSUPPORTED_ERROR; | |
467 | return 0; | |
468 | } | |
469 | } | |
470 | case Collation::DIGIT_TAG: | |
471 | // Fetch the non-numeric-collation CE32 and continue. | |
472 | ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32)); | |
473 | break; | |
474 | case Collation::U0000_TAG: | |
475 | U_ASSERT(c == 0); | |
476 | // Fetch the normal ce32 for U+0000 and continue. | |
477 | ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0); | |
478 | break; | |
479 | case Collation::OFFSET_TAG: | |
480 | ce32 = getCE32FromOffsetCE32(fromBase, c, ce32); | |
481 | break; | |
482 | case Collation::IMPLICIT_TAG: | |
483 | return Collation::unassignedCEFromCodePoint(c); | |
484 | } | |
485 | } | |
486 | return Collation::ceFromSimpleCE32(ce32); | |
487 | } | |
488 | ||
489 | int32_t | |
490 | CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) { | |
491 | int32_t length = ce64s.size(); | |
492 | for(int32_t i = 0; i < length; ++i) { | |
493 | if(ce == ce64s.elementAti(i)) { return i; } | |
494 | } | |
495 | ce64s.addElement(ce, errorCode); | |
496 | return length; | |
497 | } | |
498 | ||
499 | int32_t | |
500 | CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) { | |
501 | int32_t length = ce32s.size(); | |
502 | for(int32_t i = 0; i < length; ++i) { | |
503 | if(ce32 == (uint32_t)ce32s.elementAti(i)) { return i; } | |
504 | } | |
505 | ce32s.addElement((int32_t)ce32, errorCode); | |
506 | return length; | |
507 | } | |
508 | ||
509 | int32_t | |
510 | CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32, | |
511 | UErrorCode &errorCode) { | |
512 | if(U_FAILURE(errorCode)) { return -1; } | |
513 | U_ASSERT(!context.isEmpty()); | |
514 | int32_t index = conditionalCE32s.size(); | |
515 | if(index > Collation::MAX_INDEX) { | |
516 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
517 | return -1; | |
518 | } | |
519 | ConditionalCE32 *cond = new ConditionalCE32(context, ce32); | |
520 | if(cond == NULL) { | |
521 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
522 | return -1; | |
523 | } | |
524 | conditionalCE32s.addElement(cond, errorCode); | |
525 | return index; | |
526 | } | |
527 | ||
528 | void | |
529 | CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s, | |
530 | const int64_t ces[], int32_t cesLength, | |
531 | UErrorCode &errorCode) { | |
532 | uint32_t ce32 = encodeCEs(ces, cesLength, errorCode); | |
533 | addCE32(prefix, s, ce32, errorCode); | |
534 | } | |
535 | ||
536 | void | |
537 | CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s, | |
538 | uint32_t ce32, UErrorCode &errorCode) { | |
539 | if(U_FAILURE(errorCode)) { return; } | |
540 | if(s.isEmpty()) { | |
541 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
542 | return; | |
543 | } | |
544 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
545 | errorCode = U_INVALID_STATE_ERROR; | |
546 | return; | |
547 | } | |
548 | UChar32 c = s.char32At(0); | |
549 | int32_t cLength = U16_LENGTH(c); | |
550 | uint32_t oldCE32 = utrie2_get32(trie, c); | |
551 | UBool hasContext = !prefix.isEmpty() || s.length() > cLength; | |
552 | if(oldCE32 == Collation::FALLBACK_CE32) { | |
553 | // First tailoring for c. | |
554 | // If c has contextual base mappings or if we add a contextual mapping, | |
555 | // then copy the base mappings. | |
556 | // Otherwise we just override the base mapping. | |
557 | uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c)); | |
558 | if(hasContext || Collation::ce32HasContext(baseCE32)) { | |
559 | oldCE32 = copyFromBaseCE32(c, baseCE32, TRUE, errorCode); | |
560 | utrie2_set32(trie, c, oldCE32, &errorCode); | |
561 | if(U_FAILURE(errorCode)) { return; } | |
562 | } | |
563 | } | |
564 | if(!hasContext) { | |
565 | // No prefix, no contraction. | |
566 | if(!isBuilderContextCE32(oldCE32)) { | |
567 | utrie2_set32(trie, c, ce32, &errorCode); | |
568 | } else { | |
569 | ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32); | |
570 | cond->builtCE32 = Collation::NO_CE32; | |
571 | cond->ce32 = ce32; | |
572 | } | |
573 | } else { | |
574 | ConditionalCE32 *cond; | |
575 | if(!isBuilderContextCE32(oldCE32)) { | |
576 | // Replace the simple oldCE32 with a builder context CE32 | |
577 | // pointing to a new ConditionalCE32 list head. | |
578 | int32_t index = addConditionalCE32(UnicodeString((UChar)0), oldCE32, errorCode); | |
579 | if(U_FAILURE(errorCode)) { return; } | |
580 | uint32_t contextCE32 = makeBuilderContextCE32(index); | |
581 | utrie2_set32(trie, c, contextCE32, &errorCode); | |
582 | contextChars.add(c); | |
583 | cond = getConditionalCE32(index); | |
584 | } else { | |
585 | cond = getConditionalCE32ForCE32(oldCE32); | |
586 | cond->builtCE32 = Collation::NO_CE32; | |
587 | } | |
588 | UnicodeString suffix(s, cLength); | |
589 | UnicodeString context((UChar)prefix.length()); | |
590 | context.append(prefix).append(suffix); | |
591 | unsafeBackwardSet.addAll(suffix); | |
592 | for(;;) { | |
593 | // invariant: context > cond->context | |
594 | int32_t next = cond->next; | |
595 | if(next < 0) { | |
596 | // Append a new ConditionalCE32 after cond. | |
597 | int32_t index = addConditionalCE32(context, ce32, errorCode); | |
598 | if(U_FAILURE(errorCode)) { return; } | |
599 | cond->next = index; | |
600 | break; | |
601 | } | |
602 | ConditionalCE32 *nextCond = getConditionalCE32(next); | |
603 | int8_t cmp = context.compare(nextCond->context); | |
604 | if(cmp < 0) { | |
605 | // Insert a new ConditionalCE32 between cond and nextCond. | |
606 | int32_t index = addConditionalCE32(context, ce32, errorCode); | |
607 | if(U_FAILURE(errorCode)) { return; } | |
608 | cond->next = index; | |
609 | getConditionalCE32(index)->next = next; | |
610 | break; | |
611 | } else if(cmp == 0) { | |
612 | // Same context as before, overwrite its ce32. | |
613 | nextCond->ce32 = ce32; | |
614 | break; | |
615 | } | |
616 | cond = nextCond; | |
617 | } | |
618 | } | |
619 | modified = TRUE; | |
620 | } | |
621 | ||
622 | uint32_t | |
623 | CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) { | |
624 | uint32_t p = (uint32_t)(ce >> 32); | |
625 | uint32_t lower32 = (uint32_t)ce; | |
626 | uint32_t t = (uint32_t)(ce & 0xffff); | |
627 | U_ASSERT((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. | |
628 | if((ce & INT64_C(0xffff00ff00ff)) == 0) { | |
629 | // normal form ppppsstt | |
630 | return p | (lower32 >> 16) | (t >> 8); | |
631 | } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) { | |
632 | // long-primary form ppppppC1 | |
633 | return Collation::makeLongPrimaryCE32(p); | |
634 | } else if(p == 0 && (t & 0xff) == 0) { | |
635 | // long-secondary form ssssttC2 | |
636 | return Collation::makeLongSecondaryCE32(lower32); | |
637 | } | |
638 | return Collation::NO_CE32; | |
639 | } | |
640 | ||
641 | uint32_t | |
642 | CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) { | |
643 | // Try to encode one CE as one CE32. | |
644 | uint32_t ce32 = encodeOneCEAsCE32(ce); | |
645 | if(ce32 != Collation::NO_CE32) { return ce32; } | |
646 | int32_t index = addCE(ce, errorCode); | |
647 | if(U_FAILURE(errorCode)) { return 0; } | |
648 | if(index > Collation::MAX_INDEX) { | |
649 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
650 | return 0; | |
651 | } | |
652 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1); | |
653 | } | |
654 | ||
655 | uint32_t | |
656 | CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, | |
657 | UErrorCode &errorCode) { | |
658 | if(U_FAILURE(errorCode)) { return 0; } | |
659 | if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) { | |
660 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
661 | return 0; | |
662 | } | |
663 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
664 | errorCode = U_INVALID_STATE_ERROR; | |
665 | return 0; | |
666 | } | |
667 | if(cesLength == 0) { | |
668 | // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. | |
669 | // Do this here so that callers need not do it. | |
670 | return encodeOneCEAsCE32(0); | |
671 | } else if(cesLength == 1) { | |
672 | return encodeOneCE(ces[0], errorCode); | |
673 | } else if(cesLength == 2) { | |
674 | // Try to encode two CEs as one CE32. | |
675 | int64_t ce0 = ces[0]; | |
676 | int64_t ce1 = ces[1]; | |
677 | uint32_t p0 = (uint32_t)(ce0 >> 32); | |
678 | if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE && | |
679 | (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE && | |
680 | p0 != 0) { | |
681 | // Latin mini expansion | |
682 | return | |
683 | p0 | | |
684 | (((uint32_t)ce0 & 0xff00u) << 8) | | |
685 | (uint32_t)(ce1 >> 16) | | |
686 | Collation::SPECIAL_CE32_LOW_BYTE | | |
687 | Collation::LATIN_EXPANSION_TAG; | |
688 | } | |
689 | } | |
690 | // Try to encode two or more CEs as CE32s. | |
691 | int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH]; | |
692 | for(int32_t i = 0;; ++i) { | |
693 | if(i == cesLength) { | |
694 | return encodeExpansion32(newCE32s, cesLength, errorCode); | |
695 | } | |
696 | uint32_t ce32 = encodeOneCEAsCE32(ces[i]); | |
697 | if(ce32 == Collation::NO_CE32) { break; } | |
698 | newCE32s[i] = (int32_t)ce32; | |
699 | } | |
700 | return encodeExpansion(ces, cesLength, errorCode); | |
701 | } | |
702 | ||
703 | uint32_t | |
704 | CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) { | |
705 | if(U_FAILURE(errorCode)) { return 0; } | |
706 | // See if this sequence of CEs has already been stored. | |
707 | int64_t first = ces[0]; | |
708 | int32_t ce64sMax = ce64s.size() - length; | |
709 | for(int32_t i = 0; i <= ce64sMax; ++i) { | |
710 | if(first == ce64s.elementAti(i)) { | |
711 | if(i > Collation::MAX_INDEX) { | |
712 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
713 | return 0; | |
714 | } | |
715 | for(int32_t j = 1;; ++j) { | |
716 | if(j == length) { | |
717 | return Collation::makeCE32FromTagIndexAndLength( | |
718 | Collation::EXPANSION_TAG, i, length); | |
719 | } | |
720 | if(ce64s.elementAti(i + j) != ces[j]) { break; } | |
721 | } | |
722 | } | |
723 | } | |
724 | // Store the new sequence. | |
725 | int32_t i = ce64s.size(); | |
726 | if(i > Collation::MAX_INDEX) { | |
727 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
728 | return 0; | |
729 | } | |
730 | for(int32_t j = 0; j < length; ++j) { | |
731 | ce64s.addElement(ces[j], errorCode); | |
732 | } | |
733 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length); | |
734 | } | |
735 | ||
736 | uint32_t | |
737 | CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length, | |
738 | UErrorCode &errorCode) { | |
739 | if(U_FAILURE(errorCode)) { return 0; } | |
740 | // See if this sequence of CE32s has already been stored. | |
741 | int32_t first = newCE32s[0]; | |
742 | int32_t ce32sMax = ce32s.size() - length; | |
743 | for(int32_t i = 0; i <= ce32sMax; ++i) { | |
744 | if(first == ce32s.elementAti(i)) { | |
745 | if(i > Collation::MAX_INDEX) { | |
746 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
747 | return 0; | |
748 | } | |
749 | for(int32_t j = 1;; ++j) { | |
750 | if(j == length) { | |
751 | return Collation::makeCE32FromTagIndexAndLength( | |
752 | Collation::EXPANSION32_TAG, i, length); | |
753 | } | |
754 | if(ce32s.elementAti(i + j) != newCE32s[j]) { break; } | |
755 | } | |
756 | } | |
757 | } | |
758 | // Store the new sequence. | |
759 | int32_t i = ce32s.size(); | |
760 | if(i > Collation::MAX_INDEX) { | |
761 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
762 | return 0; | |
763 | } | |
764 | for(int32_t j = 0; j < length; ++j) { | |
765 | ce32s.addElement(newCE32s[j], errorCode); | |
766 | } | |
767 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length); | |
768 | } | |
769 | ||
770 | uint32_t | |
771 | CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, | |
772 | UErrorCode &errorCode) { | |
773 | if(U_FAILURE(errorCode)) { return 0; } | |
774 | if(!Collation::isSpecialCE32(ce32)) { return ce32; } | |
775 | switch(Collation::tagFromCE32(ce32)) { | |
776 | case Collation::LONG_PRIMARY_TAG: | |
777 | case Collation::LONG_SECONDARY_TAG: | |
778 | case Collation::LATIN_EXPANSION_TAG: | |
779 | // copy as is | |
780 | break; | |
781 | case Collation::EXPANSION32_TAG: { | |
782 | const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32); | |
783 | int32_t length = Collation::lengthFromCE32(ce32); | |
784 | ce32 = encodeExpansion32( | |
785 | reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode); | |
786 | break; | |
787 | } | |
788 | case Collation::EXPANSION_TAG: { | |
789 | const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32); | |
790 | int32_t length = Collation::lengthFromCE32(ce32); | |
791 | ce32 = encodeExpansion(baseCEs, length, errorCode); | |
792 | break; | |
793 | } | |
794 | case Collation::PREFIX_TAG: { | |
795 | // Flatten prefixes and nested suffixes (contractions) | |
796 | // into a linear list of ConditionalCE32. | |
797 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
798 | ce32 = CollationData::readCE32(p); // Default if no prefix match. | |
799 | if(!withContext) { | |
800 | return copyFromBaseCE32(c, ce32, FALSE, errorCode); | |
801 | } | |
b331163b | 802 | ConditionalCE32 head; |
57a6839d A |
803 | UnicodeString context((UChar)0); |
804 | int32_t index; | |
805 | if(Collation::isContractionCE32(ce32)) { | |
806 | index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); | |
807 | } else { | |
808 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
809 | head.next = index = addConditionalCE32(context, ce32, errorCode); | |
810 | } | |
811 | if(U_FAILURE(errorCode)) { return 0; } | |
812 | ConditionalCE32 *cond = getConditionalCE32(index); // the last ConditionalCE32 so far | |
813 | UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); | |
814 | while(prefixes.next(errorCode)) { | |
815 | context = prefixes.getString(); | |
816 | context.reverse(); | |
817 | context.insert(0, (UChar)context.length()); | |
818 | ce32 = (uint32_t)prefixes.getValue(); | |
819 | if(Collation::isContractionCE32(ce32)) { | |
820 | index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode); | |
821 | } else { | |
822 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
823 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
824 | } | |
825 | if(U_FAILURE(errorCode)) { return 0; } | |
826 | cond = getConditionalCE32(index); | |
827 | } | |
828 | ce32 = makeBuilderContextCE32(head.next); | |
829 | contextChars.add(c); | |
830 | break; | |
831 | } | |
832 | case Collation::CONTRACTION_TAG: { | |
833 | if(!withContext) { | |
834 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
835 | ce32 = CollationData::readCE32(p); // Default if no suffix match. | |
836 | return copyFromBaseCE32(c, ce32, FALSE, errorCode); | |
837 | } | |
b331163b | 838 | ConditionalCE32 head; |
57a6839d A |
839 | UnicodeString context((UChar)0); |
840 | copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); | |
841 | ce32 = makeBuilderContextCE32(head.next); | |
842 | contextChars.add(c); | |
843 | break; | |
844 | } | |
845 | case Collation::HANGUL_TAG: | |
846 | errorCode = U_UNSUPPORTED_ERROR; // We forbid tailoring of Hangul syllables. | |
847 | break; | |
848 | case Collation::OFFSET_TAG: | |
849 | ce32 = getCE32FromOffsetCE32(TRUE, c, ce32); | |
850 | break; | |
851 | case Collation::IMPLICIT_TAG: | |
852 | ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode); | |
853 | break; | |
854 | default: | |
3d1f044b | 855 | UPRV_UNREACHABLE; // require ce32 == base->getFinalCE32(ce32) |
57a6839d A |
856 | } |
857 | return ce32; | |
858 | } | |
859 | ||
860 | int32_t | |
861 | CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, | |
862 | ConditionalCE32 *cond, UErrorCode &errorCode) { | |
863 | if(U_FAILURE(errorCode)) { return 0; } | |
864 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); | |
865 | int32_t index; | |
866 | if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { | |
867 | // No match on the single code point. | |
868 | // We are underneath a prefix, and the default mapping is just | |
869 | // a fallback to the mappings for a shorter prefix. | |
870 | U_ASSERT(context.length() > 1); | |
871 | index = -1; | |
872 | } else { | |
873 | ce32 = CollationData::readCE32(p); // Default if no suffix match. | |
874 | U_ASSERT(!Collation::isContractionCE32(ce32)); | |
875 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
876 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
877 | if(U_FAILURE(errorCode)) { return 0; } | |
878 | cond = getConditionalCE32(index); | |
879 | } | |
880 | ||
881 | int32_t suffixStart = context.length(); | |
882 | UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); | |
883 | while(suffixes.next(errorCode)) { | |
884 | context.append(suffixes.getString()); | |
885 | ce32 = copyFromBaseCE32(c, (uint32_t)suffixes.getValue(), TRUE, errorCode); | |
886 | cond->next = index = addConditionalCE32(context, ce32, errorCode); | |
887 | if(U_FAILURE(errorCode)) { return 0; } | |
888 | // No need to update the unsafeBackwardSet because the tailoring set | |
889 | // is already a copy of the base set. | |
890 | cond = getConditionalCE32(index); | |
891 | context.truncate(suffixStart); | |
892 | } | |
893 | U_ASSERT(index >= 0); | |
894 | return index; | |
895 | } | |
896 | ||
897 | class CopyHelper { | |
898 | public: | |
899 | CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d, | |
900 | const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode) | |
901 | : src(s), dest(d), modifier(m), | |
902 | errorCode(initialErrorCode) {} | |
903 | ||
904 | UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) { | |
905 | ce32 = copyCE32(ce32); | |
906 | utrie2_setRange32(dest.trie, start, end, ce32, TRUE, &errorCode); | |
907 | if(CollationDataBuilder::isBuilderContextCE32(ce32)) { | |
908 | dest.contextChars.add(start, end); | |
909 | } | |
910 | return U_SUCCESS(errorCode); | |
911 | } | |
912 | ||
913 | uint32_t copyCE32(uint32_t ce32) { | |
914 | if(!Collation::isSpecialCE32(ce32)) { | |
915 | int64_t ce = modifier.modifyCE32(ce32); | |
916 | if(ce != Collation::NO_CE) { | |
917 | ce32 = dest.encodeOneCE(ce, errorCode); | |
918 | } | |
919 | } else { | |
920 | int32_t tag = Collation::tagFromCE32(ce32); | |
921 | if(tag == Collation::EXPANSION32_TAG) { | |
922 | const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer()); | |
923 | srcCE32s += Collation::indexFromCE32(ce32); | |
924 | int32_t length = Collation::lengthFromCE32(ce32); | |
925 | // Inspect the source CE32s. Just copy them if none are modified. | |
926 | // Otherwise copy to modifiedCEs, with modifications. | |
927 | UBool isModified = FALSE; | |
928 | for(int32_t i = 0; i < length; ++i) { | |
929 | ce32 = srcCE32s[i]; | |
930 | int64_t ce; | |
931 | if(Collation::isSpecialCE32(ce32) || | |
932 | (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) { | |
933 | if(isModified) { | |
934 | modifiedCEs[i] = Collation::ceFromCE32(ce32); | |
935 | } | |
936 | } else { | |
937 | if(!isModified) { | |
938 | for(int32_t j = 0; j < i; ++j) { | |
939 | modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]); | |
940 | } | |
941 | isModified = TRUE; | |
942 | } | |
943 | modifiedCEs[i] = ce; | |
944 | } | |
945 | } | |
946 | if(isModified) { | |
947 | ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); | |
948 | } else { | |
949 | ce32 = dest.encodeExpansion32( | |
950 | reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode); | |
951 | } | |
952 | } else if(tag == Collation::EXPANSION_TAG) { | |
953 | const int64_t *srcCEs = src.ce64s.getBuffer(); | |
954 | srcCEs += Collation::indexFromCE32(ce32); | |
955 | int32_t length = Collation::lengthFromCE32(ce32); | |
956 | // Inspect the source CEs. Just copy them if none are modified. | |
957 | // Otherwise copy to modifiedCEs, with modifications. | |
958 | UBool isModified = FALSE; | |
959 | for(int32_t i = 0; i < length; ++i) { | |
960 | int64_t srcCE = srcCEs[i]; | |
961 | int64_t ce = modifier.modifyCE(srcCE); | |
962 | if(ce == Collation::NO_CE) { | |
963 | if(isModified) { | |
964 | modifiedCEs[i] = srcCE; | |
965 | } | |
966 | } else { | |
967 | if(!isModified) { | |
968 | for(int32_t j = 0; j < i; ++j) { | |
969 | modifiedCEs[j] = srcCEs[j]; | |
970 | } | |
971 | isModified = TRUE; | |
972 | } | |
973 | modifiedCEs[i] = ce; | |
974 | } | |
975 | } | |
976 | if(isModified) { | |
977 | ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); | |
978 | } else { | |
979 | ce32 = dest.encodeExpansion(srcCEs, length, errorCode); | |
980 | } | |
981 | } else if(tag == Collation::BUILDER_DATA_TAG) { | |
982 | // Copy the list of ConditionalCE32. | |
983 | ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32); | |
984 | U_ASSERT(!cond->hasContext()); | |
985 | int32_t destIndex = dest.addConditionalCE32( | |
986 | cond->context, copyCE32(cond->ce32), errorCode); | |
987 | ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex); | |
988 | while(cond->next >= 0) { | |
989 | cond = src.getConditionalCE32(cond->next); | |
990 | ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex); | |
991 | destIndex = dest.addConditionalCE32( | |
992 | cond->context, copyCE32(cond->ce32), errorCode); | |
993 | int32_t suffixStart = cond->prefixLength() + 1; | |
994 | dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart)); | |
995 | prevDestCond->next = destIndex; | |
996 | } | |
997 | } else { | |
998 | // Just copy long CEs and Latin mini expansions (and other expected values) as is, | |
999 | // assuming that the modifier would not modify them. | |
1000 | U_ASSERT(tag == Collation::LONG_PRIMARY_TAG || | |
1001 | tag == Collation::LONG_SECONDARY_TAG || | |
1002 | tag == Collation::LATIN_EXPANSION_TAG || | |
1003 | tag == Collation::HANGUL_TAG); | |
1004 | } | |
1005 | } | |
1006 | return ce32; | |
1007 | } | |
1008 | ||
1009 | const CollationDataBuilder &src; | |
1010 | CollationDataBuilder &dest; | |
1011 | const CollationDataBuilder::CEModifier &modifier; | |
1012 | int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH]; | |
1013 | UErrorCode errorCode; | |
1014 | }; | |
1015 | ||
1016 | U_CDECL_BEGIN | |
1017 | ||
1018 | static UBool U_CALLCONV | |
1019 | enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) { | |
1020 | return | |
1021 | value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 || | |
1022 | ((CopyHelper *)context)->copyRangeCE32(start, end, value); | |
1023 | } | |
1024 | ||
1025 | U_CDECL_END | |
1026 | ||
1027 | void | |
1028 | CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, | |
1029 | UErrorCode &errorCode) { | |
1030 | if(U_FAILURE(errorCode)) { return; } | |
1031 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
1032 | errorCode = U_INVALID_STATE_ERROR; | |
1033 | return; | |
1034 | } | |
1035 | CopyHelper helper(src, *this, modifier, errorCode); | |
1036 | utrie2_enum(src.trie, NULL, enumRangeForCopy, &helper); | |
1037 | errorCode = helper.errorCode; | |
1038 | // Update the contextChars and the unsafeBackwardSet while copying, | |
1039 | // in case a character had conditional mappings in the source builder | |
1040 | // and they were removed later. | |
1041 | modified |= src.modified; | |
1042 | } | |
1043 | ||
1044 | void | |
1045 | CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) { | |
1046 | if(U_FAILURE(errorCode) || set.isEmpty()) { return; } | |
1047 | UnicodeSetIterator iter(set); | |
1048 | while(iter.next() && !iter.isString()) { | |
1049 | UChar32 c = iter.getCodepoint(); | |
1050 | uint32_t ce32 = utrie2_get32(trie, c); | |
1051 | if(ce32 == Collation::FALLBACK_CE32) { | |
1052 | ce32 = base->getFinalCE32(base->getCE32(c)); | |
1053 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); | |
1054 | utrie2_set32(trie, c, ce32, &errorCode); | |
1055 | } | |
1056 | } | |
1057 | modified = TRUE; | |
1058 | } | |
1059 | ||
1060 | void | |
1061 | CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) { | |
1062 | if(U_FAILURE(errorCode) || set.isEmpty()) { return; } | |
1063 | UnicodeSetIterator iter(set); | |
1064 | while(iter.next() && !iter.isString()) { | |
1065 | UChar32 c = iter.getCodepoint(); | |
1066 | uint32_t ce32 = utrie2_get32(trie, c); | |
1067 | if(ce32 == Collation::FALLBACK_CE32) { | |
1068 | ce32 = base->getFinalCE32(base->getCE32(c)); | |
1069 | if(Collation::ce32HasContext(ce32)) { | |
1070 | ce32 = copyFromBaseCE32(c, ce32, FALSE /* without context */, errorCode); | |
1071 | utrie2_set32(trie, c, ce32, &errorCode); | |
1072 | } | |
1073 | } else if(isBuilderContextCE32(ce32)) { | |
1074 | ce32 = getConditionalCE32ForCE32(ce32)->ce32; | |
1075 | // Simply abandon the list of ConditionalCE32. | |
1076 | // The caller will copy this builder in the end, | |
1077 | // eliminating unreachable data. | |
1078 | utrie2_set32(trie, c, ce32, &errorCode); | |
1079 | contextChars.remove(c); | |
1080 | } | |
1081 | } | |
1082 | modified = TRUE; | |
1083 | } | |
1084 | ||
1085 | UBool | |
1086 | CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) { | |
1087 | if(U_FAILURE(errorCode)) { return FALSE; } | |
1088 | UBool anyJamoAssigned = base == NULL; // always set jamoCE32s in the base data | |
1089 | UBool needToCopyFromBase = FALSE; | |
1090 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. | |
1091 | UChar32 jamo = jamoCpFromIndex(j); | |
1092 | UBool fromBase = FALSE; | |
1093 | uint32_t ce32 = utrie2_get32(trie, jamo); | |
1094 | anyJamoAssigned |= Collation::isAssignedCE32(ce32); | |
1095 | // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. | |
1096 | // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) | |
1097 | if(ce32 == Collation::FALLBACK_CE32) { | |
1098 | fromBase = TRUE; | |
1099 | ce32 = base->getCE32(jamo); | |
1100 | } | |
1101 | if(Collation::isSpecialCE32(ce32)) { | |
1102 | switch(Collation::tagFromCE32(ce32)) { | |
1103 | case Collation::LONG_PRIMARY_TAG: | |
1104 | case Collation::LONG_SECONDARY_TAG: | |
1105 | case Collation::LATIN_EXPANSION_TAG: | |
1106 | // Copy the ce32 as-is. | |
1107 | break; | |
1108 | case Collation::EXPANSION32_TAG: | |
1109 | case Collation::EXPANSION_TAG: | |
1110 | case Collation::PREFIX_TAG: | |
1111 | case Collation::CONTRACTION_TAG: | |
1112 | if(fromBase) { | |
1113 | // Defer copying until we know if anyJamoAssigned. | |
1114 | ce32 = Collation::FALLBACK_CE32; | |
1115 | needToCopyFromBase = TRUE; | |
1116 | } | |
1117 | break; | |
1118 | case Collation::IMPLICIT_TAG: | |
1119 | // An unassigned Jamo should only occur in tests with incomplete bases. | |
1120 | U_ASSERT(fromBase); | |
1121 | ce32 = Collation::FALLBACK_CE32; | |
1122 | needToCopyFromBase = TRUE; | |
1123 | break; | |
1124 | case Collation::OFFSET_TAG: | |
1125 | ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32); | |
1126 | break; | |
1127 | case Collation::FALLBACK_TAG: | |
1128 | case Collation::RESERVED_TAG_3: | |
1129 | case Collation::BUILDER_DATA_TAG: | |
1130 | case Collation::DIGIT_TAG: | |
1131 | case Collation::U0000_TAG: | |
1132 | case Collation::HANGUL_TAG: | |
1133 | case Collation::LEAD_SURROGATE_TAG: | |
1134 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
1135 | return FALSE; | |
1136 | } | |
1137 | } | |
1138 | jamoCE32s[j] = ce32; | |
1139 | } | |
1140 | if(anyJamoAssigned && needToCopyFromBase) { | |
1141 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { | |
1142 | if(jamoCE32s[j] == Collation::FALLBACK_CE32) { | |
1143 | UChar32 jamo = jamoCpFromIndex(j); | |
1144 | jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo), | |
1145 | /*withContext=*/ TRUE, errorCode); | |
1146 | } | |
1147 | } | |
1148 | } | |
1149 | return anyJamoAssigned && U_SUCCESS(errorCode); | |
1150 | } | |
1151 | ||
1152 | void | |
1153 | CollationDataBuilder::setDigitTags(UErrorCode &errorCode) { | |
1154 | UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode); | |
1155 | if(U_FAILURE(errorCode)) { return; } | |
1156 | UnicodeSetIterator iter(digits); | |
1157 | while(iter.next()) { | |
1158 | U_ASSERT(!iter.isString()); | |
1159 | UChar32 c = iter.getCodepoint(); | |
1160 | uint32_t ce32 = utrie2_get32(trie, c); | |
1161 | if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) { | |
1162 | int32_t index = addCE32(ce32, errorCode); | |
1163 | if(U_FAILURE(errorCode)) { return; } | |
1164 | if(index > Collation::MAX_INDEX) { | |
1165 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1166 | return; | |
1167 | } | |
1168 | ce32 = Collation::makeCE32FromTagIndexAndLength( | |
1169 | Collation::DIGIT_TAG, index, u_charDigitValue(c)); | |
1170 | utrie2_set32(trie, c, ce32, &errorCode); | |
1171 | } | |
1172 | } | |
1173 | } | |
1174 | ||
1175 | U_CDECL_BEGIN | |
1176 | ||
1177 | static UBool U_CALLCONV | |
1178 | enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { | |
1179 | int32_t *pValue = (int32_t *)context; | |
1180 | if(value == Collation::UNASSIGNED_CE32) { | |
1181 | value = Collation::LEAD_ALL_UNASSIGNED; | |
1182 | } else if(value == Collation::FALLBACK_CE32) { | |
1183 | value = Collation::LEAD_ALL_FALLBACK; | |
1184 | } else { | |
1185 | *pValue = Collation::LEAD_MIXED; | |
1186 | return FALSE; | |
1187 | } | |
1188 | if(*pValue < 0) { | |
1189 | *pValue = (int32_t)value; | |
1190 | } else if(*pValue != (int32_t)value) { | |
1191 | *pValue = Collation::LEAD_MIXED; | |
1192 | return FALSE; | |
1193 | } | |
1194 | return TRUE; | |
1195 | } | |
1196 | ||
1197 | U_CDECL_END | |
1198 | ||
1199 | void | |
1200 | CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) { | |
1201 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead) { | |
1202 | int32_t value = -1; | |
1203 | utrie2_enumForLeadSurrogate(trie, lead, NULL, enumRangeLeadValue, &value); | |
1204 | utrie2_set32ForLeadSurrogateCodeUnit( | |
1205 | trie, lead, | |
1206 | Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value, | |
1207 | &errorCode); | |
1208 | } | |
1209 | } | |
1210 | ||
1211 | void | |
1212 | CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) { | |
1213 | buildMappings(data, errorCode); | |
1214 | if(base != NULL) { | |
1215 | data.numericPrimary = base->numericPrimary; | |
1216 | data.compressibleBytes = base->compressibleBytes; | |
b331163b A |
1217 | data.numScripts = base->numScripts; |
1218 | data.scriptsIndex = base->scriptsIndex; | |
1219 | data.scriptStarts = base->scriptStarts; | |
1220 | data.scriptStartsLength = base->scriptStartsLength; | |
57a6839d A |
1221 | } |
1222 | buildFastLatinTable(data, errorCode); | |
1223 | } | |
1224 | ||
1225 | void | |
1226 | CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) { | |
1227 | if(U_FAILURE(errorCode)) { return; } | |
1228 | if(trie == NULL || utrie2_isFrozen(trie)) { | |
1229 | errorCode = U_INVALID_STATE_ERROR; | |
1230 | return; | |
1231 | } | |
1232 | ||
1233 | buildContexts(errorCode); | |
1234 | ||
1235 | uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; | |
1236 | int32_t jamoIndex = -1; | |
1237 | if(getJamoCE32s(jamoCE32s, errorCode)) { | |
1238 | jamoIndex = ce32s.size(); | |
1239 | for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) { | |
1240 | ce32s.addElement((int32_t)jamoCE32s[i], errorCode); | |
1241 | } | |
1242 | // Small optimization: Use a bit in the Hangul ce32 | |
1243 | // to indicate that none of the Jamo CE32s are isSpecialCE32() | |
1244 | // (as it should be in the root collator). | |
1245 | // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. | |
1246 | // In order to still have good trie compression and keep this code simple, | |
1247 | // we only set this flag if a whole block of 588 Hangul syllables starting with | |
1248 | // a common leading consonant (Jamo L) has this property. | |
1249 | UBool isAnyJamoVTSpecial = FALSE; | |
1250 | for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) { | |
1251 | if(Collation::isSpecialCE32(jamoCE32s[i])) { | |
1252 | isAnyJamoVTSpecial = TRUE; | |
1253 | break; | |
1254 | } | |
1255 | } | |
1256 | uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); | |
1257 | UChar32 c = Hangul::HANGUL_BASE; | |
1258 | for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L | |
1259 | uint32_t ce32 = hangulCE32; | |
1260 | if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) { | |
1261 | ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO; | |
1262 | } | |
1263 | UChar32 limit = c + Hangul::JAMO_VT_COUNT; | |
1264 | utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); | |
1265 | c = limit; | |
1266 | } | |
1267 | } else { | |
1268 | // Copy the Hangul CE32s from the base in blocks per Jamo L, | |
1269 | // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. | |
1270 | for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) { | |
1271 | uint32_t ce32 = base->getCE32(c); | |
1272 | U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG)); | |
1273 | UChar32 limit = c + Hangul::JAMO_VT_COUNT; | |
1274 | utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); | |
1275 | c = limit; | |
1276 | } | |
1277 | } | |
1278 | ||
1279 | setDigitTags(errorCode); | |
1280 | setLeadSurrogates(errorCode); | |
1281 | ||
1282 | // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. | |
1283 | ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0); | |
1284 | utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode); | |
1285 | ||
1286 | utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode); | |
1287 | if(U_FAILURE(errorCode)) { return; } | |
1288 | ||
1289 | // Mark each lead surrogate as "unsafe" | |
1290 | // if any of its 1024 associated supplementary code points is "unsafe". | |
1291 | UChar32 c = 0x10000; | |
1292 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { | |
1293 | if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { | |
1294 | unsafeBackwardSet.add(lead); | |
1295 | } | |
1296 | } | |
1297 | unsafeBackwardSet.freeze(); | |
1298 | ||
1299 | data.trie = trie; | |
1300 | data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer()); | |
1301 | data.ces = ce64s.getBuffer(); | |
1302 | data.contexts = contexts.getBuffer(); | |
1303 | ||
1304 | data.ce32sLength = ce32s.size(); | |
1305 | data.cesLength = ce64s.size(); | |
1306 | data.contextsLength = contexts.length(); | |
1307 | ||
1308 | data.base = base; | |
1309 | if(jamoIndex >= 0) { | |
1310 | data.jamoCE32s = data.ce32s + jamoIndex; | |
1311 | } else { | |
1312 | data.jamoCE32s = base->jamoCE32s; | |
1313 | } | |
1314 | data.unsafeBackwardSet = &unsafeBackwardSet; | |
1315 | } | |
1316 | ||
1317 | void | |
1318 | CollationDataBuilder::clearContexts() { | |
1319 | contexts.remove(); | |
1320 | UnicodeSetIterator iter(contextChars); | |
1321 | while(iter.next()) { | |
1322 | U_ASSERT(!iter.isString()); | |
1323 | uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint()); | |
1324 | U_ASSERT(isBuilderContextCE32(ce32)); | |
1325 | getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32; | |
1326 | } | |
1327 | } | |
1328 | ||
1329 | void | |
1330 | CollationDataBuilder::buildContexts(UErrorCode &errorCode) { | |
1331 | if(U_FAILURE(errorCode)) { return; } | |
1332 | // Ignore abandoned lists and the cached builtCE32, | |
1333 | // and build all contexts from scratch. | |
1334 | contexts.remove(); | |
1335 | UnicodeSetIterator iter(contextChars); | |
1336 | while(U_SUCCESS(errorCode) && iter.next()) { | |
1337 | U_ASSERT(!iter.isString()); | |
1338 | UChar32 c = iter.getCodepoint(); | |
1339 | uint32_t ce32 = utrie2_get32(trie, c); | |
1340 | if(!isBuilderContextCE32(ce32)) { | |
1341 | // Impossible: No context data for c in contextChars. | |
1342 | errorCode = U_INTERNAL_PROGRAM_ERROR; | |
1343 | return; | |
1344 | } | |
1345 | ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32); | |
1346 | ce32 = buildContext(cond, errorCode); | |
1347 | utrie2_set32(trie, c, ce32, &errorCode); | |
1348 | } | |
1349 | } | |
1350 | ||
1351 | uint32_t | |
1352 | CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) { | |
1353 | if(U_FAILURE(errorCode)) { return 0; } | |
1354 | // The list head must have no context. | |
1355 | U_ASSERT(!head->hasContext()); | |
1356 | // The list head must be followed by one or more nodes that all do have context. | |
1357 | U_ASSERT(head->next >= 0); | |
1358 | UCharsTrieBuilder prefixBuilder(errorCode); | |
1359 | UCharsTrieBuilder contractionBuilder(errorCode); | |
1360 | for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) { | |
1361 | // After the list head, the prefix or suffix can be empty, but not both. | |
1362 | U_ASSERT(cond == head || cond->hasContext()); | |
1363 | int32_t prefixLength = cond->prefixLength(); | |
1364 | UnicodeString prefix(cond->context, 0, prefixLength + 1); | |
1365 | // Collect all contraction suffixes for one prefix. | |
1366 | ConditionalCE32 *firstCond = cond; | |
1367 | ConditionalCE32 *lastCond = cond; | |
1368 | while(cond->next >= 0 && | |
1369 | (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) { | |
1370 | lastCond = cond; | |
1371 | } | |
1372 | uint32_t ce32; | |
1373 | int32_t suffixStart = prefixLength + 1; // == prefix.length() | |
1374 | if(lastCond->context.length() == suffixStart) { | |
1375 | // One prefix without contraction suffix. | |
1376 | U_ASSERT(firstCond == lastCond); | |
1377 | ce32 = lastCond->ce32; | |
1378 | cond = lastCond; | |
1379 | } else { | |
1380 | // Build the contractions trie. | |
1381 | contractionBuilder.clear(); | |
1382 | // Entry for an empty suffix, to be stored before the trie. | |
b331163b | 1383 | uint32_t emptySuffixCE32 = 0; |
57a6839d A |
1384 | uint32_t flags = 0; |
1385 | if(firstCond->context.length() == suffixStart) { | |
1386 | // There is a mapping for the prefix and the single character c. (p|c) | |
1387 | // If no other suffix matches, then we return this value. | |
1388 | emptySuffixCE32 = firstCond->ce32; | |
1389 | cond = getConditionalCE32(firstCond->next); | |
1390 | } else { | |
1391 | // There is no mapping for the prefix and just the single character. | |
1392 | // (There is no p|c, only p|cd, p|ce etc.) | |
1393 | flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH; | |
1394 | // When the prefix matches but none of the prefix-specific suffixes, | |
1395 | // then we fall back to the mappings with the next-longest prefix, | |
1396 | // and ultimately to mappings with no prefix. | |
1397 | // Each fallback might be another set of contractions. | |
1398 | // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, | |
1399 | // then in text "pch" we find the ch contraction. | |
1400 | for(cond = head;; cond = getConditionalCE32(cond->next)) { | |
1401 | int32_t length = cond->prefixLength(); | |
1402 | if(length == prefixLength) { break; } | |
1403 | if(cond->defaultCE32 != Collation::NO_CE32 && | |
1404 | (length==0 || prefix.endsWith(cond->context, 1, length))) { | |
1405 | emptySuffixCE32 = cond->defaultCE32; | |
1406 | } | |
1407 | } | |
1408 | cond = firstCond; | |
1409 | } | |
1410 | // Optimization: Set a flag when | |
1411 | // the first character of every contraction suffix has lccc!=0. | |
1412 | // Short-circuits contraction matching when a normal letter follows. | |
1413 | flags |= Collation::CONTRACT_NEXT_CCC; | |
1414 | // Add all of the non-empty suffixes into the contraction trie. | |
1415 | for(;;) { | |
1416 | UnicodeString suffix(cond->context, suffixStart); | |
1417 | uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0)); | |
1418 | if(fcd16 <= 0xff) { | |
1419 | flags &= ~Collation::CONTRACT_NEXT_CCC; | |
1420 | } | |
1421 | fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1)); | |
1422 | if(fcd16 > 0xff) { | |
1423 | // The last suffix character has lccc!=0, allowing for discontiguous contractions. | |
1424 | flags |= Collation::CONTRACT_TRAILING_CCC; | |
1425 | } | |
1426 | contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode); | |
1427 | if(cond == lastCond) { break; } | |
1428 | cond = getConditionalCE32(cond->next); | |
1429 | } | |
1430 | int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode); | |
1431 | if(U_FAILURE(errorCode)) { return 0; } | |
1432 | if(index > Collation::MAX_INDEX) { | |
1433 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1434 | return 0; | |
1435 | } | |
1436 | ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags; | |
1437 | } | |
1438 | U_ASSERT(cond == lastCond); | |
1439 | firstCond->defaultCE32 = ce32; | |
1440 | if(prefixLength == 0) { | |
1441 | if(cond->next < 0) { | |
1442 | // No non-empty prefixes, only contractions. | |
1443 | return ce32; | |
1444 | } | |
1445 | } else { | |
1446 | prefix.remove(0, 1); // Remove the length unit. | |
1447 | prefix.reverse(); | |
1448 | prefixBuilder.add(prefix, (int32_t)ce32, errorCode); | |
1449 | if(cond->next < 0) { break; } | |
1450 | } | |
1451 | } | |
1452 | U_ASSERT(head->defaultCE32 != Collation::NO_CE32); | |
1453 | int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode); | |
1454 | if(U_FAILURE(errorCode)) { return 0; } | |
1455 | if(index > Collation::MAX_INDEX) { | |
1456 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
1457 | return 0; | |
1458 | } | |
1459 | return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index); | |
1460 | } | |
1461 | ||
1462 | int32_t | |
1463 | CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, | |
1464 | UErrorCode &errorCode) { | |
1465 | UnicodeString context; | |
1466 | context.append((UChar)(defaultCE32 >> 16)).append((UChar)defaultCE32); | |
1467 | UnicodeString trieString; | |
1468 | context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode)); | |
1469 | if(U_FAILURE(errorCode)) { return -1; } | |
1470 | int32_t index = contexts.indexOf(context); | |
1471 | if(index < 0) { | |
1472 | index = contexts.length(); | |
1473 | contexts.append(context); | |
1474 | } | |
1475 | return index; | |
1476 | } | |
1477 | ||
1478 | void | |
1479 | CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) { | |
1480 | if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; } | |
1481 | ||
1482 | delete fastLatinBuilder; | |
1483 | fastLatinBuilder = new CollationFastLatinBuilder(errorCode); | |
1484 | if(fastLatinBuilder == NULL) { | |
1485 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
1486 | return; | |
1487 | } | |
1488 | if(fastLatinBuilder->forData(data, errorCode)) { | |
1489 | const uint16_t *table = fastLatinBuilder->getTable(); | |
1490 | int32_t length = fastLatinBuilder->lengthOfTable(); | |
1491 | if(base != NULL && length == base->fastLatinTableLength && | |
1492 | uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) { | |
1493 | // Same fast Latin table as in the base, use that one instead. | |
1494 | delete fastLatinBuilder; | |
1495 | fastLatinBuilder = NULL; | |
1496 | table = base->fastLatinTable; | |
1497 | } | |
1498 | data.fastLatinTable = table; | |
1499 | data.fastLatinTableLength = length; | |
1500 | } else { | |
1501 | delete fastLatinBuilder; | |
1502 | fastLatinBuilder = NULL; | |
1503 | } | |
1504 | } | |
1505 | ||
1506 | int32_t | |
1507 | CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) { | |
1508 | return getCEs(s, 0, ces, cesLength); | |
1509 | } | |
1510 | ||
1511 | int32_t | |
1512 | CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s, | |
1513 | int64_t ces[], int32_t cesLength) { | |
1514 | int32_t prefixLength = prefix.length(); | |
1515 | if(prefixLength == 0) { | |
1516 | return getCEs(s, 0, ces, cesLength); | |
1517 | } else { | |
1518 | return getCEs(prefix + s, prefixLength, ces, cesLength); | |
1519 | } | |
1520 | } | |
1521 | ||
1522 | int32_t | |
1523 | CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start, | |
1524 | int64_t ces[], int32_t cesLength) { | |
1525 | if(collIter == NULL) { | |
1526 | collIter = new DataBuilderCollationIterator(*this); | |
1527 | if(collIter == NULL) { return 0; } | |
1528 | } | |
1529 | return collIter->fetchCEs(s, start, ces, cesLength); | |
1530 | } | |
1531 | ||
1532 | U_NAMESPACE_END | |
1533 | ||
1534 | #endif // !UCONFIG_NO_COLLATION |