1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * created on: 2012jul28
11 * created by: Markus W. Scherer
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/ucol.h"
19 #include "unicode/udata.h"
20 #include "unicode/uscript.h"
22 #include "collation.h"
23 #include "collationdata.h"
// Resolves a special CE32 that merely points at another CE32.
// Precondition (asserted): ce32 is a special CE32.
//  - DIGIT_TAG: replaced by the entry of ce32s[] at the index encoded in ce32.
//  - LEAD_SURROGATE_TAG: replaced by Collation::UNASSIGNED_CE32.
//  - U0000_TAG: replaced by the normal CE32 for U+0000 (per the comment in that branch).
// NOTE(review): the return type, the U0000_TAG assignment and the end of the body are
// not visible in this excerpt; presumably the resulting ce32 is returned — confirm.
31 CollationData::getIndirectCE32(uint32_t ce32
) const {
32 U_ASSERT(Collation::isSpecialCE32(ce32
));
33 int32_t tag
= Collation::tagFromCE32(ce32
);
34 if(tag
== Collation::DIGIT_TAG
) {
35 // Fetch the non-numeric-collation CE32.
36 ce32
= ce32s
[Collation::indexFromCE32(ce32
)];
37 } else if(tag
== Collation::LEAD_SURROGATE_TAG
) {
38 ce32
= Collation::UNASSIGNED_CE32
;
39 } else if(tag
== Collation::U0000_TAG
) {
40 // Fetch the normal ce32 for U+0000.
// If ce32 is a special CE32, replaces it with the result of getIndirectCE32(ce32).
// NOTE(review): the return type and the rest of the body are not visible in this
// excerpt; presumably the (possibly replaced) ce32 is returned — confirm.
47 CollationData::getFinalCE32(uint32_t ce32
) const {
48 if(Collation::isSpecialCE32(ce32
)) {
49 ce32
= getIndirectCE32(ce32
);
// Maps code point c to a single collation element, or reports why that is not possible.
//
// Returns 0 immediately if errorCode already indicates a failure.
// Otherwise looks up the CE32 for c (using base when the local value is
// Collation::FALLBACK_CE32) and resolves special CE32s in a loop:
//  - tags that need context or several CEs set errorCode to U_UNSUPPORTED_ERROR;
//  - FALLBACK_TAG and RESERVED_TAG_3 set errorCode to U_INTERNAL_PROGRAM_ERROR;
//  - single-CE forms are converted and returned directly;
//  - indirections (EXPANSION32 of length 1, DIGIT, U0000) load another CE32 and loop again.
// A non-special CE32 is converted with Collation::ceFromSimpleCE32().
// NOTE(review): the return type, the assignment of d, and the break/return statements
// after the error assignments are not visible in this excerpt. Presumably d is this
// object or base, depending on the fallback, and the error paths return 0 — confirm.
55 CollationData::getSingleCE(UChar32 c
, UErrorCode
&errorCode
) const {
56 if(U_FAILURE(errorCode
)) { return 0; }
57 // Keep parallel with CollationDataBuilder::getSingleCE().
// d is the data object whose ce32s[]/ces[] arrays are indexed below.
58 const CollationData
*d
;
59 uint32_t ce32
= getCE32(c
);
60 if(ce32
== Collation::FALLBACK_CE32
) {
// No value in this data object for c: take the CE32 from the base data.
62 ce32
= base
->getCE32(c
);
// Keep resolving until ce32 is a simple (non-special) CE32 or a case returns.
66 while(Collation::isSpecialCE32(ce32
)) {
67 switch(Collation::tagFromCE32(ce32
)) {
// These tags are reported as unsupported for a single-CE lookup.
68 case Collation::LATIN_EXPANSION_TAG
:
69 case Collation::BUILDER_DATA_TAG
:
70 case Collation::PREFIX_TAG
:
71 case Collation::CONTRACTION_TAG
:
72 case Collation::HANGUL_TAG
:
73 case Collation::LEAD_SURROGATE_TAG
:
74 errorCode
= U_UNSUPPORTED_ERROR
;
// These tags are reported as an internal error.
76 case Collation::FALLBACK_TAG
:
77 case Collation::RESERVED_TAG_3
:
78 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
80 case Collation::LONG_PRIMARY_TAG
:
81 return Collation::ceFromLongPrimaryCE32(ce32
);
82 case Collation::LONG_SECONDARY_TAG
:
83 return Collation::ceFromLongSecondaryCE32(ce32
);
// An expansion of length 1 is just an indirection to another CE32.
84 case Collation::EXPANSION32_TAG
:
85 if(Collation::lengthFromCE32(ce32
) == 1) {
86 ce32
= d
->ce32s
[Collation::indexFromCE32(ce32
)];
// NOTE(review): presumably the else branch (length other than 1) — confirm.
89 errorCode
= U_UNSUPPORTED_ERROR
;
// An expansion of length 1 stored as a 64-bit CE is returned directly.
92 case Collation::EXPANSION_TAG
: {
93 if(Collation::lengthFromCE32(ce32
) == 1) {
94 return d
->ces
[Collation::indexFromCE32(ce32
)];
// NOTE(review): presumably the else branch (length other than 1) — confirm.
96 errorCode
= U_UNSUPPORTED_ERROR
;
100 case Collation::DIGIT_TAG
:
101 // Fetch the non-numeric-collation CE32 and continue.
102 ce32
= d
->ce32s
[Collation::indexFromCE32(ce32
)];
104 case Collation::U0000_TAG
:
106 // Fetch the normal ce32 for U+0000 and continue.
109 case Collation::OFFSET_TAG
:
110 return d
->getCEFromOffsetCE32(c
, ce32
);
111 case Collation::IMPLICIT_TAG
:
112 return Collation::unassignedCEFromCodePoint(c
);
// ce32 is no longer special here: convert the simple CE32 to a CE.
115 return Collation::ceFromSimpleCE32(ce32
);
// Returns the first primary weight of the reordering group that contains script.
// The result is 0 when getScriptIndex(script) is 0; otherwise it is the
// scriptStarts[] entry for that index shifted into the upper 16 bits.
// NOTE(review): the return type is not visible in this excerpt; the cast suggests uint32_t.
119 CollationData::getFirstPrimaryForGroup(int32_t script
) const {
120 int32_t index
= getScriptIndex(script
);
121 return index
== 0 ? 0 : (uint32_t)scriptStarts
[index
] << 16;
// Returns the last primary weight of the reordering group that contains script:
// the start of the following range (scriptStarts[index + 1]) shifted into the
// upper 16 bits, minus 1.
// NOTE(review): the lines between the index lookup and the limit computation are not
// visible in this excerpt; presumably index == 0 is handled there — confirm.
125 CollationData::getLastPrimaryForGroup(int32_t script
) const {
126 int32_t index
= getScriptIndex(script
);
// The next range's start is the exclusive limit of this range.
130 uint32_t limit
= scriptStarts
[index
+ 1];
131 return (limit
<< 16) - 1;
// Finds the script or special reorder code whose primary-weight range contains p.
//
// p is first range-checked against scriptStarts[1] and the last scriptStarts[] entry.
// The range index is then found by a linear scan of scriptStarts[], and scriptsIndex[]
// is searched for an entry equal to that index: first the numScripts script entries,
// then the MAX_NUM_SPECIAL_REORDER_CODES special entries that follow them. A match
// among the special entries returns UCOL_REORDER_CODE_FIRST + i.
// NOTE(review): the return type, the declaration/initialization of index, any scaling
// of p, and the return statements for the out-of-range and script-match cases are not
// visible in this excerpt — confirm them against the full file.
135 CollationData::getGroupForPrimary(uint32_t p
) const {
// Outside of all script ranges.
137 if(p
< scriptStarts
[1] || scriptStarts
[scriptStartsLength
- 1] <= p
) {
// Advance to the range whose limit is above p.
141 while(p
>= scriptStarts
[index
+ 1]) { ++index
; }
// Look for a regular script mapped to this range.
142 for(int32_t i
= 0; i
< numScripts
; ++i
) {
143 if(scriptsIndex
[i
] == index
) {
// Look for a special reorder code mapped to this range.
147 for(int32_t i
= 0; i
< MAX_NUM_SPECIAL_REORDER_CODES
; ++i
) {
148 if(scriptsIndex
[numScripts
+ i
] == index
) {
149 return UCOL_REORDER_CODE_FIRST
+ i
;
// Maps a script code or special reorder code to its entry in scriptsIndex[].
//  - script < numScripts: returns scriptsIndex[script].
//  - script >= UCOL_REORDER_CODE_FIRST: rebased to 0 and, if below
//    MAX_NUM_SPECIAL_REORDER_CODES, looked up after the numScripts script entries.
// NOTE(review): the return type, the first branch of the if-chain and the fallback
// return values are not visible in this excerpt. Callers in this file treat a result
// of 0 as "no range for this script", so presumably those paths return 0 — confirm.
156 CollationData::getScriptIndex(int32_t script
) const {
159 } else if(script
< numScripts
) {
160 return scriptsIndex
[script
];
// Codes between the known scripts and the special reorder codes.
161 } else if(script
< UCOL_REORDER_CODE_FIRST
) {
// Rebase special reorder codes so that the first one is 0.
164 script
-= UCOL_REORDER_CODE_FIRST
;
165 if(script
< MAX_NUM_SPECIAL_REORDER_CODES
) {
166 return scriptsIndex
[numScripts
+ script
];
// Collects the script codes that share a reordering range with script.
//
// Returns 0 if errorCode already indicates a failure, or if getScriptIndex(script)
// is 0. Special reorder codes (script >= UCOL_REORDER_CODE_FIRST) have no aliases
// and are handled separately. Otherwise every script i below numScripts whose
// scriptsIndex[] entry equals the index of script is counted, and errorCode is set
// to U_BUFFER_OVERFLOW_ERROR when the count exceeds capacity.
// NOTE(review): the return type, the declaration of length, the writes into dest and
// the final return are not visible in this excerpt; presumably dest receives the
// matching scripts while length < capacity and the total count is returned — confirm.
174 CollationData::getEquivalentScripts(int32_t script
,
175 int32_t dest
[], int32_t capacity
,
176 UErrorCode
&errorCode
) const {
177 if(U_FAILURE(errorCode
)) { return 0; }
178 int32_t index
= getScriptIndex(script
);
179 if(index
== 0) { return 0; }
180 if(script
>= UCOL_REORDER_CODE_FIRST
) {
181 // Special groups have no aliases.
185 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
// Scan all regular scripts for ones mapped to the same range.
191 for(int32_t i
= 0; i
< numScripts
; ++i
) {
192 if(scriptsIndex
[i
] == index
) {
193 if(length
< capacity
) {
// More matches than fit into dest.
199 if(length
> capacity
) {
200 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
// Convenience overload: forwards to the five-argument makeReorderRanges() with
// FALSE as the third argument.
// NOTE(review): the third parameter's declaration is not visible in this excerpt;
// the other overload's body reads a latinMustMove flag, so presumably FALSE means
// "Latin does not have to move" — confirm.
206 CollationData::makeReorderRanges(const int32_t *reorder
, int32_t length
,
207 UVector32
&ranges
, UErrorCode
&errorCode
) const {
208 makeReorderRanges(reorder
, length
, FALSE
, ranges
, errorCode
);
// Computes the primary-weight reordering ranges for the given list of reorder codes.
//
// reorder/length: the requested order of script and special reorder codes.
// ranges: output; cleared first, then filled with one element per (limit, offset)
//         pair, encoded as described near the end of the function.
// errorCode: nothing is done if it already indicates a failure. It is set to
//         U_ILLEGAL_ARGUMENT_ERROR for USCRIPT_UNKNOWN occurring twice, for
//         UCOL_REORDER_CODE_DEFAULT in the list, and for duplicate or equivalent
//         scripts; and to U_BUFFER_OVERFLOW_ERROR when more primary lead bytes
//         are needed than are available.
//
// Outline: a table maps each script range to a new lead byte. Special groups not
// mentioned in the input are placed first, then the listed scripts from the bottom
// up (and, after USCRIPT_UNKNOWN, from the top down), then all remaining scripts in
// the middle. The table is finally converted to the list of ranges.
// NOTE(review): the third parameter's declaration (used below as latinMustMove) and
// a number of return/else/break lines are not visible in this excerpt.
212 CollationData::makeReorderRanges(const int32_t *reorder
, int32_t length
,
214 UVector32
&ranges
, UErrorCode
&errorCode
) const {
215 if(U_FAILURE(errorCode
)) { return; }
216 ranges
.removeAllElements();
// An empty list, or a list with only USCRIPT_UNKNOWN, is special-cased.
// NOTE(review): the body is not visible; presumably it returns with empty ranges.
217 if(length
== 0 || (length
== 1 && reorder
[0] == USCRIPT_UNKNOWN
)) {
221 // Maps each script-or-group range to a new lead byte.
222 uint8_t table
[MAX_NUM_SCRIPT_RANGES
];
223 uprv_memset(table
, 0, sizeof(table
));
226 // Set "don't care" values for reserved ranges.
227 int32_t index
= scriptsIndex
[
228 numScripts
+ REORDER_RESERVED_BEFORE_LATIN
- UCOL_REORDER_CODE_FIRST
];
232 index
= scriptsIndex
[
233 numScripts
+ REORDER_RESERVED_AFTER_LATIN
- UCOL_REORDER_CODE_FIRST
];
239 // Never reorder special low and high primary lead bytes.
240 U_ASSERT(scriptStartsLength
>= 2);
241 U_ASSERT(scriptStarts
[0] == 0);
// lowStart: next free position when assigning ranges from the bottom up.
242 int32_t lowStart
= scriptStarts
[1];
243 U_ASSERT(lowStart
== ((Collation::MERGE_SEPARATOR_BYTE
+ 1) << 8));
// highLimit: current limit when assigning ranges from the top down.
244 int32_t highLimit
= scriptStarts
[scriptStartsLength
- 1];
245 U_ASSERT(highLimit
== (Collation::TRAIL_WEIGHT_BYTE
<< 8));
247 // Get the set of special reorder codes in the input list.
248 // This supports a fixed number of special reorder codes;
249 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
// Bit i of specials is set when special reorder code i occurs in the input.
250 uint32_t specials
= 0;
251 for(int32_t i
= 0; i
< length
; ++i
) {
252 int32_t reorderCode
= reorder
[i
] - UCOL_REORDER_CODE_FIRST
;
253 if(0 <= reorderCode
&& reorderCode
< MAX_NUM_SPECIAL_REORDER_CODES
) {
254 specials
|= (uint32_t)1 << reorderCode
;
258 // Start the reordering with the special low reorder codes that do not occur in the input.
259 for(int32_t i
= 0; i
< MAX_NUM_SPECIAL_REORDER_CODES
; ++i
) {
260 int32_t index
= scriptsIndex
[numScripts
+ i
];
261 if(index
!= 0 && (specials
& ((uint32_t)1 << i
)) == 0) {
262 lowStart
= addLowScriptRange(table
, index
, lowStart
);
266 // Skip the reserved range before Latin if Latin is the first script,
267 // so that we do not move it unnecessarily.
268 int32_t skippedReserved
= 0;
269 if(specials
== 0 && reorder
[0] == USCRIPT_LATIN
&& !latinMustMove
) {
270 int32_t index
= scriptsIndex
[USCRIPT_LATIN
];
271 U_ASSERT(index
!= 0);
272 int32_t start
= scriptStarts
[index
];
273 U_ASSERT(lowStart
<= start
);
// Remember how much was skipped so that it can be given back on overflow below.
274 skippedReserved
= start
- lowStart
;
278 // Reorder according to the input scripts, continuing from the bottom of the primary range.
279 int32_t originalLength
= length
; // length will be decremented if "others" is in the list.
280 UBool hasReorderToEnd
= FALSE
;
281 for(int32_t i
= 0; i
< length
;) {
282 int32_t script
= reorder
[i
++];
283 if(script
== USCRIPT_UNKNOWN
) {
284 // Put the remaining scripts at the top.
285 hasReorderToEnd
= TRUE
;
// Consume the rest of the list from its end, shrinking length as we go.
287 script
= reorder
[--length
];
288 if(script
== USCRIPT_UNKNOWN
|| // Must occur at most once.
289 script
== UCOL_REORDER_CODE_DEFAULT
) {
290 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
293 int32_t index
= getScriptIndex(script
);
// Scripts without a range in this data are ignored.
294 if(index
== 0) { continue; }
295 if(table
[index
] != 0) { // Duplicate or equivalent script.
296 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
299 highLimit
= addHighScriptRange(table
, index
, highLimit
);
303 if(script
== UCOL_REORDER_CODE_DEFAULT
) {
304 // The default code must be the only one in the list, and that is handled by the caller.
305 // Otherwise it must not be used.
306 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
309 int32_t index
= getScriptIndex(script
);
// Scripts without a range in this data are ignored.
310 if(index
== 0) { continue; }
311 if(table
[index
] != 0) { // Duplicate or equivalent script.
312 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
315 lowStart
= addLowScriptRange(table
, index
, lowStart
);
318 // Put all remaining scripts into the middle.
319 for(int32_t i
= 1; i
< scriptStartsLength
- 1; ++i
) {
320 int32_t leadByte
= table
[i
];
// Already assigned above (or marked as "don't care").
321 if(leadByte
!= 0) { continue; }
322 int32_t start
= scriptStarts
[i
];
323 if(!hasReorderToEnd
&& start
> lowStart
) {
324 // No need to move this script.
327 lowStart
= addLowScriptRange(table
, i
, lowStart
);
// The bottom-up and top-down assignments overlap: not enough lead bytes.
329 if(lowStart
> highLimit
) {
330 if((lowStart
- (skippedReserved
& 0xff00)) <= highLimit
) {
331 // Try not skipping the before-Latin reserved range.
332 makeReorderRanges(reorder
, originalLength
, TRUE
, ranges
, errorCode
);
335 // We need more primary lead bytes than available, despite the reserved ranges.
336 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
340 // Turn lead bytes into a list of (limit, offset) pairs.
341 // Encode each pair in one list element:
342 // Upper 16 bits = limit, lower 16 = signed lead byte offset.
// NOTE(review): the declaration of offset is not visible in this excerpt.
344 for(int32_t i
= 1;; ++i
) {
345 int32_t nextOffset
= offset
;
// Advance i over consecutive ranges that keep the current offset.
346 while(i
< scriptStartsLength
- 1) {
347 int32_t newLeadByte
= table
[i
];
348 if(newLeadByte
== 0xff) {
349 // "Don't care" lead byte for reserved range, continue with current offset.
351 nextOffset
= newLeadByte
- (scriptStarts
[i
] >> 8);
352 if(nextOffset
!= offset
) { break; }
// Emit the pair for the run that ends at range i.
356 if(offset
!= 0 || i
< scriptStartsLength
- 1) {
357 ranges
.addElement(((int32_t)scriptStarts
[i
] << 16) | (offset
& 0xffff), errorCode
);
359 if(i
== scriptStartsLength
- 1) { break; }
// Assigns the script range at index the next lead byte at the low end.
//
// table[index] receives the lead byte of lowStart (its bits 15..8). lowStart is then
// advanced by the number of lead bytes the range occupies (the difference of the
// 0xff00 parts of its limit and start), and its low byte is taken from the limit.
// NOTE(review): the return type, the body of the first if (taken when the low byte of
// the range start is below the low byte of lowStart) and the return statement are not
// visible in this excerpt. The callers assign the result back to lowStart, so
// presumably the updated lowStart is returned — confirm.
365 CollationData::addLowScriptRange(uint8_t table
[], int32_t index
, int32_t lowStart
) const {
366 int32_t start
= scriptStarts
[index
];
367 if((start
& 0xff) < (lowStart
& 0xff)) {
370 table
[index
] = (uint8_t)(lowStart
>> 8);
371 int32_t limit
= scriptStarts
[index
+ 1];
372 lowStart
= ((lowStart
& 0xff00) + ((limit
& 0xff00) - (start
& 0xff00))) | (limit
& 0xff);
// Assigns the script range at index the next lead byte at the high end.
//
// highLimit is lowered by the number of lead bytes the range occupies (the difference
// of the 0xff00 parts of its limit and start), and its low byte is taken from the
// range start. table[index] then receives the lead byte of the new highLimit.
// NOTE(review): the return type, the body of the first if (taken when the low byte of
// the range limit is above the low byte of highLimit) and the return statement are not
// visible in this excerpt. The caller assigns the result back to highLimit, so
// presumably the updated highLimit is returned — confirm.
377 CollationData::addHighScriptRange(uint8_t table
[], int32_t index
, int32_t highLimit
) const {
378 int32_t limit
= scriptStarts
[index
+ 1];
379 if((limit
& 0xff) > (highLimit
& 0xff)) {
382 int32_t start
= scriptStarts
[index
];
383 highLimit
= ((highLimit
& 0xff00) - ((limit
& 0xff00) - (start
& 0xff00))) | (start
& 0xff);
384 table
[index
] = (uint8_t)(highLimit
>> 8);
390 #endif // !UCONFIG_NO_COLLATION