]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2012-2014, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ******************************************************************************* | |
6 | * collationdata.cpp | |
7 | * | |
8 | * created on: 2012jul28 | |
9 | * created by: Markus W. Scherer | |
10 | */ | |
11 | ||
12 | #include "unicode/utypes.h" | |
13 | ||
14 | #if !UCONFIG_NO_COLLATION | |
15 | ||
16 | #include "unicode/ucol.h" | |
17 | #include "unicode/udata.h" | |
18 | #include "unicode/uscript.h" | |
19 | #include "cmemory.h" | |
20 | #include "collation.h" | |
21 | #include "collationdata.h" | |
22 | #include "uassert.h" | |
23 | #include "utrie2.h" | |
24 | ||
25 | U_NAMESPACE_BEGIN | |
26 | ||
27 | uint32_t | |
28 | CollationData::getIndirectCE32(uint32_t ce32) const { | |
29 | U_ASSERT(Collation::isSpecialCE32(ce32)); | |
30 | int32_t tag = Collation::tagFromCE32(ce32); | |
31 | if(tag == Collation::DIGIT_TAG) { | |
32 | // Fetch the non-numeric-collation CE32. | |
33 | ce32 = ce32s[Collation::indexFromCE32(ce32)]; | |
34 | } else if(tag == Collation::LEAD_SURROGATE_TAG) { | |
35 | ce32 = Collation::UNASSIGNED_CE32; | |
36 | } else if(tag == Collation::U0000_TAG) { | |
37 | // Fetch the normal ce32 for U+0000. | |
38 | ce32 = ce32s[0]; | |
39 | } | |
40 | return ce32; | |
41 | } | |
42 | ||
43 | uint32_t | |
44 | CollationData::getFinalCE32(uint32_t ce32) const { | |
45 | if(Collation::isSpecialCE32(ce32)) { | |
46 | ce32 = getIndirectCE32(ce32); | |
47 | } | |
48 | return ce32; | |
49 | } | |
50 | ||
51 | uint32_t | |
52 | CollationData::getFirstPrimaryForGroup(int32_t script) const { | |
53 | int32_t index = findScript(script); | |
54 | if(index < 0) { | |
55 | return 0; | |
56 | } | |
57 | uint32_t head = scripts[index]; | |
58 | return (head & 0xff00) << 16; | |
59 | } | |
60 | ||
61 | uint32_t | |
62 | CollationData::getLastPrimaryForGroup(int32_t script) const { | |
63 | int32_t index = findScript(script); | |
64 | if(index < 0) { | |
65 | return 0; | |
66 | } | |
67 | uint32_t head = scripts[index]; | |
68 | uint32_t lastByte = head & 0xff; | |
69 | return ((lastByte + 1) << 24) - 1; | |
70 | } | |
71 | ||
72 | int32_t | |
73 | CollationData::getGroupForPrimary(uint32_t p) const { | |
74 | p >>= 24; // Reordering groups are distinguished by primary lead bytes. | |
75 | for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) { | |
76 | uint32_t lastByte = scripts[i] & 0xff; | |
77 | if(p <= lastByte) { | |
78 | return scripts[i + 2]; | |
79 | } | |
80 | } | |
81 | return -1; | |
82 | } | |
83 | ||
84 | int32_t | |
85 | CollationData::findScript(int32_t script) const { | |
86 | if(script < 0 || 0xffff < script) { return -1; } | |
87 | for(int32_t i = 0; i < scriptsLength;) { | |
88 | int32_t limit = i + 2 + scripts[i + 1]; | |
89 | for(int32_t j = i + 2; j < limit; ++j) { | |
90 | if(script == scripts[j]) { return i; } | |
91 | } | |
92 | i = limit; | |
93 | } | |
94 | return -1; | |
95 | } | |
96 | ||
97 | int32_t | |
98 | CollationData::getEquivalentScripts(int32_t script, | |
99 | int32_t dest[], int32_t capacity, | |
100 | UErrorCode &errorCode) const { | |
101 | if(U_FAILURE(errorCode)) { return 0; } | |
102 | int32_t i = findScript(script); | |
103 | if(i < 0) { return 0; } | |
104 | int32_t length = scripts[i + 1]; | |
105 | U_ASSERT(length != 0); | |
106 | if(length > capacity) { | |
107 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
108 | return length; | |
109 | } | |
110 | i += 2; | |
111 | dest[0] = scripts[i++]; | |
112 | for(int32_t j = 1; j < length; ++j) { | |
113 | script = scripts[i++]; | |
114 | // Sorted insertion. | |
115 | for(int32_t k = j;; --k) { | |
116 | // Invariant: dest[k] is free to receive either script or dest[k - 1]. | |
117 | if(k > 0 && script < dest[k - 1]) { | |
118 | dest[k] = dest[k - 1]; | |
119 | } else { | |
120 | dest[k] = script; | |
121 | break; | |
122 | } | |
123 | } | |
124 | } | |
125 | return length; | |
126 | } | |
127 | ||
128 | void | |
129 | CollationData::makeReorderTable(const int32_t *reorder, int32_t length, | |
130 | uint8_t table[256], UErrorCode &errorCode) const { | |
131 | if(U_FAILURE(errorCode)) { return; } | |
132 | ||
133 | // Initialize the table. | |
134 | // Never reorder special low and high primary lead bytes. | |
135 | int32_t lowByte; | |
136 | for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) { | |
137 | table[lowByte] = lowByte; | |
138 | } | |
139 | // lowByte == 03 | |
140 | ||
141 | int32_t highByte; | |
142 | for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) { | |
143 | table[highByte] = highByte; | |
144 | } | |
145 | // highByte == FE | |
146 | ||
147 | // Set intermediate bytes to 0 to indicate that they have not been set yet. | |
148 | for(int32_t i = lowByte; i <= highByte; ++i) { | |
149 | table[i] = 0; | |
150 | } | |
151 | ||
152 | // Get the set of special reorder codes in the input list. | |
153 | // This supports up to 32 special reorder codes; | |
154 | // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. | |
155 | uint32_t specials = 0; | |
156 | for(int32_t i = 0; i < length; ++i) { | |
157 | int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; | |
158 | if(0 <= reorderCode && reorderCode <= 31) { | |
159 | specials |= (uint32_t)1 << reorderCode; | |
160 | } | |
161 | } | |
162 | ||
163 | // Start the reordering with the special low reorder codes that do not occur in the input. | |
164 | for(int32_t i = 0;; i += 3) { | |
165 | if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes. | |
166 | int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST; | |
167 | if(reorderCode < 0) { break; } // Went beyond special reorder codes. | |
168 | if((specials & ((uint32_t)1 << reorderCode)) == 0) { | |
169 | int32_t head = scripts[i]; | |
170 | int32_t firstByte = head >> 8; | |
171 | int32_t lastByte = head & 0xff; | |
172 | do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); | |
173 | } | |
174 | } | |
175 | ||
176 | // Reorder according to the input scripts, continuing from the bottom of the bytes range. | |
177 | for(int32_t i = 0; i < length;) { | |
178 | int32_t script = reorder[i++]; | |
179 | if(script == USCRIPT_UNKNOWN) { | |
180 | // Put the remaining scripts at the top. | |
181 | while(i < length) { | |
182 | script = reorder[--length]; | |
183 | if(script == USCRIPT_UNKNOWN || // Must occur at most once. | |
184 | script == UCOL_REORDER_CODE_DEFAULT) { | |
185 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
186 | return; | |
187 | } | |
188 | int32_t index = findScript(script); | |
189 | if(index < 0) { continue; } | |
190 | int32_t head = scripts[index]; | |
191 | int32_t firstByte = head >> 8; | |
192 | int32_t lastByte = head & 0xff; | |
193 | if(table[firstByte] != 0) { // Duplicate or equivalent script. | |
194 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
195 | return; | |
196 | } | |
197 | do { table[lastByte--] = highByte--; } while(firstByte <= lastByte); | |
198 | } | |
199 | break; | |
200 | } | |
201 | if(script == UCOL_REORDER_CODE_DEFAULT) { | |
202 | // The default code must be the only one in the list, and that is handled by the caller. | |
203 | // Otherwise it must not be used. | |
204 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
205 | return; | |
206 | } | |
207 | int32_t index = findScript(script); | |
208 | if(index < 0) { continue; } | |
209 | int32_t head = scripts[index]; | |
210 | int32_t firstByte = head >> 8; | |
211 | int32_t lastByte = head & 0xff; | |
212 | if(table[firstByte] != 0) { // Duplicate or equivalent script. | |
213 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
214 | return; | |
215 | } | |
216 | do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); | |
217 | } | |
218 | ||
219 | // Put all remaining scripts into the middle. | |
220 | // Avoid table[0] which must remain 0. | |
221 | for(int32_t i = 1; i <= 0xff; ++i) { | |
222 | if(table[i] == 0) { table[i] = lowByte++; } | |
223 | } | |
224 | U_ASSERT(lowByte == highByte + 1); | |
225 | } | |
226 | ||
227 | U_NAMESPACE_END | |
228 | ||
229 | #endif // !UCONFIG_NO_COLLATION |