2 *******************************************************************************
3 * Copyright (C) 2012-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * created on: 2012jul28
9 * created by: Markus W. Scherer
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
20 #include "collation.h"
21 #include "collationdata.h"
// Resolves one level of indirection for a special CE32.
// The input must be a special CE32 (asserted). Depending on its tag:
//  - DIGIT_TAG: replaced by the ce32s[] entry at the index encoded in ce32.
//  - LEAD_SURROGATE_TAG: replaced by Collation::UNASSIGNED_CE32.
//  - U0000_TAG: handled by a branch whose body is not visible here.
// NOTE(review): this listing omits source lines (the embedded line numbers skip);
// the return type and the end of the function are not shown — presumably the
// possibly-updated ce32 is returned; confirm against the full source.
29 CollationData::getIndirectCE32(uint32_t ce32
) const {
30 U_ASSERT(Collation::isSpecialCE32(ce32
));
31 int32_t tag
= Collation::tagFromCE32(ce32
);
32 if(tag
== Collation::DIGIT_TAG
) {
33 // Fetch the non-numeric-collation CE32.
34 ce32
= ce32s
[Collation::indexFromCE32(ce32
)];
35 } else if(tag
== Collation::LEAD_SURROGATE_TAG
) {
36 ce32
= Collation::UNASSIGNED_CE32
;
37 } else if(tag
== Collation::U0000_TAG
) {
38 // Fetch the normal ce32 for U+0000.
// If ce32 is a special CE32, replaces it with the result of getIndirectCE32(ce32);
// a non-special ce32 is left untouched.
// NOTE(review): the return statement is not visible in this listing —
// presumably the resulting ce32 is returned; confirm against the full source.
45 CollationData::getFinalCE32(uint32_t ce32
) const {
46 if(Collation::isSpecialCE32(ce32
)) {
47 ce32
= getIndirectCE32(ce32
);
// Returns the single collation element (CE) for code point c.
// Returns 0 immediately if errorCode already indicates a failure.
// The CE32 is looked up in this data and, when it is FALLBACK_CE32, in `base`.
// While the CE32 is special, it is dispatched on its tag:
//  - tags that cannot yield a single CE set U_UNSUPPORTED_ERROR,
//  - FALLBACK_TAG / RESERVED_TAG_3 set U_INTERNAL_PROGRAM_ERROR,
//  - long-primary, long-secondary, offset and implicit tags return a CE directly,
//  - expansions of length 1 and DIGIT_TAG are resolved through d->ce32s / d->ces.
// A non-special CE32 is converted with Collation::ceFromSimpleCE32().
// NOTE(review): this listing omits source lines (embedded numbers skip), including
// the assignments to `d`, the statements following each error assignment, and the
// closing braces; the comments below describe only what is visible.
53 CollationData::getSingleCE(UChar32 c
, UErrorCode
&errorCode
) const {
54 if(U_FAILURE(errorCode
)) { return 0; }
55 // Keep parallel with CollationDataBuilder::getSingleCE().
// d is the data object whose ce32s/ces arrays are used below; its assignment
// lines are not visible here (presumably base or this — TODO confirm).
56 const CollationData
*d
;
57 uint32_t ce32
= getCE32(c
);
58 if(ce32
== Collation::FALLBACK_CE32
) {
60 ce32
= base
->getCE32(c
);
64 while(Collation::isSpecialCE32(ce32
)) {
65 switch(Collation::tagFromCE32(ce32
)) {
// These tags are reported as unsupported for a single-CE lookup.
66 case Collation::LATIN_EXPANSION_TAG
:
67 case Collation::BUILDER_DATA_TAG
:
68 case Collation::PREFIX_TAG
:
69 case Collation::CONTRACTION_TAG
:
70 case Collation::HANGUL_TAG
:
71 case Collation::LEAD_SURROGATE_TAG
:
72 errorCode
= U_UNSUPPORTED_ERROR
;
// These tags are reported as an internal program error.
74 case Collation::FALLBACK_TAG
:
75 case Collation::RESERVED_TAG_3
:
76 errorCode
= U_INTERNAL_PROGRAM_ERROR
;
78 case Collation::LONG_PRIMARY_TAG
:
79 return Collation::ceFromLongPrimaryCE32(ce32
);
80 case Collation::LONG_SECONDARY_TAG
:
81 return Collation::ceFromLongSecondaryCE32(ce32
);
// A 32-bit expansion of length 1 is replaced by its single CE32;
// the unsupported error below presumably applies to other lengths (else-branch
// line not visible — TODO confirm).
82 case Collation::EXPANSION32_TAG
:
83 if(Collation::lengthFromCE32(ce32
) == 1) {
84 ce32
= d
->ce32s
[Collation::indexFromCE32(ce32
)];
87 errorCode
= U_UNSUPPORTED_ERROR
;
// A 64-bit expansion of length 1 returns its single CE directly.
90 case Collation::EXPANSION_TAG
: {
91 if(Collation::lengthFromCE32(ce32
) == 1) {
92 return d
->ces
[Collation::indexFromCE32(ce32
)];
94 errorCode
= U_UNSUPPORTED_ERROR
;
98 case Collation::DIGIT_TAG
:
99 // Fetch the non-numeric-collation CE32 and continue.
100 ce32
= d
->ce32s
[Collation::indexFromCE32(ce32
)];
102 case Collation::U0000_TAG
:
104 // Fetch the normal ce32 for U+0000 and continue.
107 case Collation::OFFSET_TAG
:
108 return d
->getCEFromOffsetCE32(c
, ce32
);
109 case Collation::IMPLICIT_TAG
:
110 return Collation::unassignedCEFromCodePoint(c
);
// Reached with a non-special CE32 (the loop condition above is false).
113 return Collation::ceFromSimpleCE32(ce32
);
// Returns the first primary weight for the given script or reorder group.
// The script is mapped to an index via getScriptIndex(); index 0 yields 0,
// otherwise the result is the scriptStarts[] entry shifted into the upper 16 bits
// of a uint32_t.
117 CollationData::getFirstPrimaryForGroup(int32_t script
) const {
118 int32_t index
= getScriptIndex(script
);
119 return index
== 0 ? 0 : (uint32_t)scriptStarts
[index
] << 16;
// Returns the last primary weight for the given script or reorder group:
// the start of the following range (scriptStarts[index + 1]) shifted into the
// upper 16 bits, minus 1.
// NOTE(review): source lines 125-127 are not visible in this listing —
// presumably they handle index == 0 as getFirstPrimaryForGroup() does; confirm.
123 CollationData::getLastPrimaryForGroup(int32_t script
) const {
124 int32_t index
= getScriptIndex(script
);
128 uint32_t limit
= scriptStarts
[index
+ 1];
129 return (limit
<< 16) - 1;
// Maps a primary weight back to a script or special reorder code.
// Visible steps:
//  1. Range check of p against scriptStarts[1] and the last scriptStarts[] entry.
//  2. Advance `index` while p >= scriptStarts[index + 1] to find the containing range.
//  3. Search scriptsIndex[0..numScripts) for a script mapped to that index.
//  4. Search the special reorder codes that follow in scriptsIndex[]; a match
//     returns UCOL_REORDER_CODE_FIRST + i.
// NOTE(review): this listing omits lines, including any adjustment of p before the
// comparison, the declaration of `index`, and the return statements for the
// out-of-range, script-match and not-found cases; confirm against the full source.
133 CollationData::getGroupForPrimary(uint32_t p
) const {
135 if(p
< scriptStarts
[1] || scriptStarts
[scriptStartsLength
- 1] <= p
) {
139 while(p
>= scriptStarts
[index
+ 1]) { ++index
; }
140 for(int32_t i
= 0; i
< numScripts
; ++i
) {
141 if(scriptsIndex
[i
] == index
) {
145 for(int32_t i
= 0; i
< MAX_NUM_SPECIAL_REORDER_CODES
; ++i
) {
146 if(scriptsIndex
[numScripts
+ i
] == index
) {
147 return UCOL_REORDER_CODE_FIRST
+ i
;
// Maps a script code or special reorder code to an index into scriptStarts[].
// Visible branches:
//  - script < numScripts: returns scriptsIndex[script].
//  - script < UCOL_REORDER_CODE_FIRST: body not visible here.
//  - otherwise: script is rebased by UCOL_REORDER_CODE_FIRST and, if it is below
//    MAX_NUM_SPECIAL_REORDER_CODES, scriptsIndex[numScripts + script] is returned.
// NOTE(review): the first branch of the if-chain (source lines 155-156) and the
// remaining return statements are omitted from this listing — callers in this file
// treat a result of 0 as "no range"; confirm which inputs yield 0.
154 CollationData::getScriptIndex(int32_t script
) const {
157 } else if(script
< numScripts
) {
158 return scriptsIndex
[script
];
159 } else if(script
< UCOL_REORDER_CODE_FIRST
) {
162 script
-= UCOL_REORDER_CODE_FIRST
;
163 if(script
< MAX_NUM_SPECIAL_REORDER_CODES
) {
164 return scriptsIndex
[numScripts
+ script
];
// Collects the scripts that share the same scriptStarts[] index as `script`.
// Returns 0 if errorCode already indicates failure or if the script has no index
// (getScriptIndex() returned 0).
// Codes >= UCOL_REORDER_CODE_FIRST are special groups without aliases.
// For ordinary scripts, every i in [0, numScripts) with scriptsIndex[i] == index is
// counted; errorCode is set to U_BUFFER_OVERFLOW_ERROR when the count exceeds
// `capacity`.
// NOTE(review): this listing omits lines, including the declaration of `length`,
// the stores into dest[], and the return statements — presumably the number of
// equivalent scripts is returned; confirm against the full source.
172 CollationData::getEquivalentScripts(int32_t script
,
173 int32_t dest
[], int32_t capacity
,
174 UErrorCode
&errorCode
) const {
175 if(U_FAILURE(errorCode
)) { return 0; }
176 int32_t index
= getScriptIndex(script
);
177 if(index
== 0) { return 0; }
178 if(script
>= UCOL_REORDER_CODE_FIRST
) {
179 // Special groups have no aliases.
183 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
189 for(int32_t i
= 0; i
< numScripts
; ++i
) {
190 if(scriptsIndex
[i
] == index
) {
191 if(length
< capacity
) {
197 if(length
> capacity
) {
198 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
// Convenience overload: forwards to the five-argument makeReorderRanges() with
// FALSE as the third argument, passing reorder, length, ranges and errorCode
// through unchanged.
204 CollationData::makeReorderRanges(const int32_t *reorder
, int32_t length
,
205 UVector32
&ranges
, UErrorCode
&errorCode
) const {
206 makeReorderRanges(reorder
, length
, FALSE
, ranges
, errorCode
);
// Computes the reorder ranges for a list of reorder codes and writes them to `ranges`.
// Does nothing if errorCode already indicates a failure.
// Visible steps:
//  1. Clear `ranges`; an empty list or a lone USCRIPT_UNKNOWN is special-cased.
//  2. Zero a table that maps each script-or-group range to a new lead byte.
//  3. Look up the two reserved ranges (before/after Latin) for "don't care" marking.
//  4. Exclude the special low and high primary lead bytes from reordering.
//  5. Collect a bit set of the special reorder codes present in the input.
//  6. Assign low ranges to special codes that do NOT occur in the input.
//  7. Optionally skip the reserved range before Latin when Latin comes first.
//  8. Assign ranges in input order; USCRIPT_UNKNOWN switches to assigning the
//     remaining listed codes from the top (highLimit) downwards.
//  9. Place all unassigned ranges in the middle.
// 10. If lead bytes run out only because of the skipped reserved range, retry with
//     TRUE as the third argument; otherwise report U_BUFFER_OVERFLOW_ERROR.
// 11. Encode the lead byte table as (limit, offset) pairs appended to `ranges`.
// NOTE(review): this listing omits source lines (embedded numbers skip). In
// particular source line 211 is missing from the parameter list; the
// `latinMustMove` flag read below is presumably declared there. Return statements,
// closing braces and some loop headers are also missing; confirm details against
// the full source.
210 CollationData::makeReorderRanges(const int32_t *reorder
, int32_t length
,
212 UVector32
&ranges
, UErrorCode
&errorCode
) const {
213 if(U_FAILURE(errorCode
)) { return; }
214 ranges
.removeAllElements();
// Nothing to reorder: body of this branch is not visible (presumably returns
// with `ranges` left empty — TODO confirm).
215 if(length
== 0 || (length
== 1 && reorder
[0] == USCRIPT_UNKNOWN
)) {
219 // Maps each script-or-group range to a new lead byte.
220 uint8_t table
[MAX_NUM_SCRIPT_RANGES
];
221 uprv_memset(table
, 0, sizeof(table
));
224 // Set "don't care" values for reserved ranges.
225 int32_t index
= scriptsIndex
[
226 numScripts
+ REORDER_RESERVED_BEFORE_LATIN
- UCOL_REORDER_CODE_FIRST
];
230 index
= scriptsIndex
[
231 numScripts
+ REORDER_RESERVED_AFTER_LATIN
- UCOL_REORDER_CODE_FIRST
];
237 // Never reorder special low and high primary lead bytes.
238 U_ASSERT(scriptStartsLength
>= 2);
239 U_ASSERT(scriptStarts
[0] == 0);
// lowStart/highLimit bound the reorderable area; they are 16-bit values whose
// upper byte is compared against lead byte constants in the assertions below.
240 int32_t lowStart
= scriptStarts
[1];
241 U_ASSERT(lowStart
== ((Collation::MERGE_SEPARATOR_BYTE
+ 1) << 8));
242 int32_t highLimit
= scriptStarts
[scriptStartsLength
- 1];
243 U_ASSERT(highLimit
== (Collation::TRAIL_WEIGHT_BYTE
<< 8));
245 // Get the set of special reorder codes in the input list.
246 // This supports a fixed number of special reorder codes;
247 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
248 uint32_t specials
= 0;
249 for(int32_t i
= 0; i
< length
; ++i
) {
250 int32_t reorderCode
= reorder
[i
] - UCOL_REORDER_CODE_FIRST
;
251 if(0 <= reorderCode
&& reorderCode
< MAX_NUM_SPECIAL_REORDER_CODES
) {
252 specials
|= (uint32_t)1 << reorderCode
;
256 // Start the reordering with the special low reorder codes that do not occur in the input.
257 for(int32_t i
= 0; i
< MAX_NUM_SPECIAL_REORDER_CODES
; ++i
) {
258 int32_t index
= scriptsIndex
[numScripts
+ i
];
259 if(index
!= 0 && (specials
& ((uint32_t)1 << i
)) == 0) {
260 lowStart
= addLowScriptRange(table
, index
, lowStart
);
264 // Skip the reserved range before Latin if Latin is the first script,
265 // so that we do not move it unnecessarily.
266 int32_t skippedReserved
= 0;
267 if(specials
== 0 && reorder
[0] == USCRIPT_LATIN
&& !latinMustMove
) {
268 int32_t index
= scriptsIndex
[USCRIPT_LATIN
];
269 U_ASSERT(index
!= 0);
270 int32_t start
= scriptStarts
[index
];
271 U_ASSERT(lowStart
<= start
);
// Remember how much was skipped so the overflow check further down can tell
// whether not skipping would have made everything fit.
272 skippedReserved
= start
- lowStart
;
276 // Reorder according to the input scripts, continuing from the bottom of the primary range.
277 int32_t originalLength
= length
; // length will be decremented if "others" is in the list.
278 UBool hasReorderToEnd
= FALSE
;
279 for(int32_t i
= 0; i
< length
;) {
280 int32_t script
= reorder
[i
++];
281 if(script
== USCRIPT_UNKNOWN
) {
282 // Put the remaining scripts at the top.
283 hasReorderToEnd
= TRUE
;
// Codes after "others" are consumed from the end of the list (length shrinks)
// and are placed downwards from highLimit.
285 script
= reorder
[--length
];
286 if(script
== USCRIPT_UNKNOWN
|| // Must occur at most once.
287 script
== UCOL_REORDER_CODE_DEFAULT
) {
288 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
291 int32_t index
= getScriptIndex(script
);
292 if(index
== 0) { continue; }
293 if(table
[index
] != 0) { // Duplicate or equivalent script.
294 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
297 highLimit
= addHighScriptRange(table
, index
, highLimit
);
301 if(script
== UCOL_REORDER_CODE_DEFAULT
) {
302 // The default code must be the only one in the list, and that is handled by the caller.
303 // Otherwise it must not be used.
304 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
307 int32_t index
= getScriptIndex(script
);
308 if(index
== 0) { continue; }
309 if(table
[index
] != 0) { // Duplicate or equivalent script.
310 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
313 lowStart
= addLowScriptRange(table
, index
, lowStart
);
316 // Put all remaining scripts into the middle.
317 for(int32_t i
= 1; i
< scriptStartsLength
- 1; ++i
) {
318 int32_t leadByte
= table
[i
];
319 if(leadByte
!= 0) { continue; }
320 int32_t start
= scriptStarts
[i
];
321 if(!hasReorderToEnd
&& start
> lowStart
) {
322 // No need to move this script.
325 lowStart
= addLowScriptRange(table
, i
, lowStart
);
// Overflow: the low and high assignments have crossed.
327 if(lowStart
> highLimit
) {
328 if((lowStart
- (skippedReserved
& 0xff00)) <= highLimit
) {
329 // Try not skipping the before-Latin reserved range.
330 makeReorderRanges(reorder
, originalLength
, TRUE
, ranges
, errorCode
);
333 // We need more primary lead bytes than available, despite the reserved ranges.
334 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
338 // Turn lead bytes into a list of (limit, offset) pairs.
339 // Encode each pair in one list element:
340 // Upper 16 bits = limit, lower 16 = signed lead byte offset.
// `offset` is declared on a source line that is not visible in this listing.
342 for(int32_t i
= 1;; ++i
) {
343 int32_t nextOffset
= offset
;
344 while(i
< scriptStartsLength
- 1) {
345 int32_t newLeadByte
= table
[i
];
346 if(newLeadByte
== 0xff) {
347 // "Don't care" lead byte for reserved range, continue with current offset.
349 nextOffset
= newLeadByte
- (scriptStarts
[i
] >> 8);
350 if(nextOffset
!= offset
) { break; }
354 if(offset
!= 0 || i
< scriptStartsLength
- 1) {
355 ranges
.addElement(((int32_t)scriptStarts
[i
] << 16) | (offset
& 0xffff), errorCode
);
357 if(i
== scriptStartsLength
- 1) { break; }
// Assigns the range at `index` the next free lead byte at the low end.
// table[index] receives the lead byte of lowStart (lowStart >> 8). lowStart is then
// advanced by the number of lead bytes the range spans (difference of the upper
// bytes of its limit and start) and takes the low byte of the range's limit.
// NOTE(review): the body of the `if` that compares the low bytes of start and
// lowStart is not visible — presumably it rounds lowStart up to the next lead byte;
// the return statement is also omitted (callers assign the result to lowStart).
363 CollationData::addLowScriptRange(uint8_t table
[], int32_t index
, int32_t lowStart
) const {
364 int32_t start
= scriptStarts
[index
];
365 if((start
& 0xff) < (lowStart
& 0xff)) {
368 table
[index
] = (uint8_t)(lowStart
>> 8);
369 int32_t limit
= scriptStarts
[index
+ 1];
370 lowStart
= ((lowStart
& 0xff00) + ((limit
& 0xff00) - (start
& 0xff00))) | (limit
& 0xff);
// Assigns the range at `index` lead bytes at the high end, growing downwards.
// highLimit is lowered by the number of lead bytes the range spans (difference of
// the upper bytes of its limit and start) and takes the low byte of the range's
// start; table[index] then receives the lead byte of the new highLimit.
// NOTE(review): the body of the `if` that compares the low bytes of limit and
// highLimit is not visible — presumably it rounds highLimit down to a lead byte
// boundary; the return statement is also omitted (callers assign the result to
// highLimit).
375 CollationData::addHighScriptRange(uint8_t table
[], int32_t index
, int32_t highLimit
) const {
376 int32_t limit
= scriptStarts
[index
+ 1];
377 if((limit
& 0xff) > (highLimit
& 0xff)) {
380 int32_t start
= scriptStarts
[index
];
381 highLimit
= ((highLimit
& 0xff00) - ((limit
& 0xff00) - (start
& 0xff00))) | (start
& 0xff);
382 table
[index
] = (uint8_t)(highLimit
>> 8);
388 #endif // !UCONFIG_NO_COLLATION