]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | // © 2017 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | // norms.cpp | |
5 | // created: 2017jun04 Markus W. Scherer | |
6 | // (pulled out of n2builder.cpp) | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_NORMALIZATION | |
11 | ||
12 | #include <stdio.h> | |
13 | #include <stdlib.h> | |
14 | #include "unicode/errorcode.h" | |
3d1f044b | 15 | #include "unicode/umutablecptrie.h" |
0f5d89e8 A |
16 | #include "unicode/unistr.h" |
17 | #include "unicode/utf16.h" | |
18 | #include "normalizer2impl.h" | |
19 | #include "norms.h" | |
20 | #include "toolutil.h" | |
0f5d89e8 A |
21 | #include "uvectr32.h" |
22 | ||
23 | U_NAMESPACE_BEGIN | |
24 | ||
25 | void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) { | |
26 | if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { | |
27 | if(cc==0) { | |
28 | fLastStarterIndex=fLength; | |
29 | } | |
30 | fArray[fLength++]=(c<<8)|cc; | |
31 | return; | |
32 | } | |
33 | // Let this character bubble back to its canonical order. | |
34 | int32_t i=fLength-1; | |
35 | while(i>fLastStarterIndex && ccAt(i)>cc) { | |
36 | --i; | |
37 | } | |
38 | ++i; // after the last starter or prevCC<=cc | |
39 | // Move this and the following characters forward one to make space. | |
40 | for(int32_t j=fLength; i<j; --j) { | |
41 | fArray[j]=fArray[j-1]; | |
42 | } | |
43 | fArray[i]=(c<<8)|cc; | |
44 | ++fLength; | |
45 | fDidReorder=TRUE; | |
46 | } | |
47 | ||
48 | void BuilderReorderingBuffer::toString(UnicodeString &dest) const { | |
49 | dest.remove(); | |
50 | for(int32_t i=0; i<fLength; ++i) { | |
51 | dest.append(charAt(i)); | |
52 | } | |
53 | } | |
54 | ||
55 | UChar32 Norm::combine(UChar32 trail) const { | |
56 | int32_t length; | |
57 | const CompositionPair *pairs=getCompositionPairs(length); | |
58 | for(int32_t i=0; i<length; ++i) { | |
59 | if(trail==pairs[i].trail) { | |
60 | return pairs[i].composite; | |
61 | } | |
62 | if(trail<pairs[i].trail) { | |
63 | break; | |
64 | } | |
65 | } | |
66 | return U_SENTINEL; | |
67 | } | |
68 | ||
69 | Norms::Norms(UErrorCode &errorCode) { | |
3d1f044b | 70 | normTrie = umutablecptrie_open(0, 0, &errorCode); |
0f5d89e8 A |
71 | normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); |
72 | // Default "inert" Norm struct at index 0. Practically immutable. | |
73 | norms=allocNorm(); | |
74 | norms->type=Norm::INERT; | |
75 | } | |
76 | ||
77 | Norms::~Norms() { | |
3d1f044b | 78 | umutablecptrie_close(normTrie); |
0f5d89e8 A |
79 | int32_t normsLength=utm_countItems(normMem); |
80 | for(int32_t i=1; i<normsLength; ++i) { | |
81 | delete norms[i].mapping; | |
82 | delete norms[i].rawMapping; | |
83 | delete norms[i].compositions; | |
84 | } | |
85 | utm_close(normMem); | |
86 | } | |
87 | ||
88 | Norm *Norms::allocNorm() { | |
89 | Norm *p=(Norm *)utm_alloc(normMem); | |
90 | norms=(Norm *)utm_getStart(normMem); // in case it got reallocated | |
91 | return p; | |
92 | } | |
93 | ||
94 | Norm *Norms::getNorm(UChar32 c) { | |
3d1f044b | 95 | uint32_t i = umutablecptrie_get(normTrie, c); |
0f5d89e8 A |
96 | if(i==0) { |
97 | return nullptr; | |
98 | } | |
99 | return norms+i; | |
100 | } | |
101 | ||
102 | const Norm *Norms::getNorm(UChar32 c) const { | |
3d1f044b | 103 | uint32_t i = umutablecptrie_get(normTrie, c); |
0f5d89e8 A |
104 | if(i==0) { |
105 | return nullptr; | |
106 | } | |
107 | return norms+i; | |
108 | } | |
109 | ||
110 | const Norm &Norms::getNormRef(UChar32 c) const { | |
3d1f044b | 111 | return norms[umutablecptrie_get(normTrie, c)]; |
0f5d89e8 A |
112 | } |
113 | ||
114 | Norm *Norms::createNorm(UChar32 c) { | |
3d1f044b | 115 | uint32_t i=umutablecptrie_get(normTrie, c); |
0f5d89e8 A |
116 | if(i!=0) { |
117 | return norms+i; | |
118 | } else { | |
119 | /* allocate Norm */ | |
120 | Norm *p=allocNorm(); | |
121 | IcuToolErrorCode errorCode("gennorm2/createNorm()"); | |
3d1f044b | 122 | umutablecptrie_set(normTrie, c, (uint32_t)(p - norms), errorCode); |
0f5d89e8 A |
123 | return p; |
124 | } | |
125 | } | |
126 | ||
127 | void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const { | |
128 | int32_t length=mapping.length(); | |
129 | U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK); | |
130 | const char16_t *s=mapping.getBuffer(); | |
131 | int32_t i=0; | |
132 | UChar32 c; | |
133 | while(i<length) { | |
134 | U16_NEXT(s, i, length, c); | |
135 | buffer.append(c, getCC(c)); | |
136 | } | |
137 | if(buffer.didReorder()) { | |
138 | buffer.toString(mapping); | |
139 | } | |
140 | } | |
141 | ||
142 | UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const { | |
143 | if((highCC-lowCC)>=2) { | |
144 | int32_t length; | |
145 | const CompositionPair *pairs=norm.getCompositionPairs(length); | |
146 | for(int32_t i=0; i<length; ++i) { | |
147 | uint8_t trailCC=getCC(pairs[i].trail); | |
148 | if(lowCC<trailCC && trailCC<highCC) { | |
149 | return TRUE; | |
150 | } | |
151 | } | |
152 | } | |
153 | return FALSE; | |
154 | } | |
155 | ||
0f5d89e8 | 156 | void Norms::enumRanges(Enumerator &e) { |
3d1f044b A |
157 | UChar32 start = 0, end; |
158 | uint32_t i; | |
159 | while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0, | |
160 | nullptr, nullptr, &i)) >= 0) { | |
161 | if (i > 0) { | |
162 | e.rangeHandler(start, end, norms[i]); | |
163 | } | |
164 | start = end + 1; | |
165 | } | |
0f5d89e8 A |
166 | } |
167 | ||
168 | Norms::Enumerator::~Enumerator() {} | |
169 | ||
0f5d89e8 A |
170 | void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { |
171 | if(norm.mappingType!=Norm::ROUND_TRIP) { return; } | |
172 | if(start!=end) { | |
173 | fprintf(stderr, | |
174 | "gennorm2 error: same round-trip mapping for " | |
175 | "more than 1 code point U+%04lX..U+%04lX\n", | |
176 | (long)start, (long)end); | |
177 | exit(U_INVALID_FORMAT_ERROR); | |
178 | } | |
179 | if(norm.cc!=0) { | |
180 | fprintf(stderr, | |
181 | "gennorm2 error: " | |
182 | "U+%04lX has a round-trip mapping and ccc!=0, " | |
183 | "not possible in Unicode normalization\n", | |
184 | (long)start); | |
185 | exit(U_INVALID_FORMAT_ERROR); | |
186 | } | |
187 | // setRoundTripMapping() ensured that there are exactly two code points. | |
188 | const UnicodeString &m=*norm.mapping; | |
189 | UChar32 lead=m.char32At(0); | |
190 | UChar32 trail=m.char32At(m.length()-1); | |
191 | if(norms.getCC(lead)!=0) { | |
192 | fprintf(stderr, | |
193 | "gennorm2 error: " | |
194 | "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " | |
195 | "not possible in Unicode normalization\n", | |
196 | (long)start, (long)lead); | |
197 | exit(U_INVALID_FORMAT_ERROR); | |
198 | } | |
199 | // Flag for trailing character. | |
200 | norms.createNorm(trail)->combinesBack=TRUE; | |
201 | // Insert (trail, composite) pair into compositions list for the lead character. | |
202 | IcuToolErrorCode errorCode("gennorm2/addComposition()"); | |
203 | Norm *leadNorm=norms.createNorm(lead); | |
204 | UVector32 *compositions=leadNorm->compositions; | |
205 | int32_t i; | |
206 | if(compositions==nullptr) { | |
207 | compositions=leadNorm->compositions=new UVector32(errorCode); | |
208 | i=0; // "insert" the first pair at index 0 | |
209 | } else { | |
210 | // Insertion sort, and check for duplicate trail characters. | |
211 | int32_t length; | |
212 | const CompositionPair *pairs=leadNorm->getCompositionPairs(length); | |
213 | for(i=0; i<length; ++i) { | |
214 | if(trail==pairs[i].trail) { | |
215 | fprintf(stderr, | |
216 | "gennorm2 error: same round-trip mapping for " | |
217 | "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", | |
218 | (long)start, (long)lead, (long)trail); | |
219 | exit(U_INVALID_FORMAT_ERROR); | |
220 | } | |
221 | if(trail<pairs[i].trail) { | |
222 | break; | |
223 | } | |
224 | } | |
225 | } | |
226 | compositions->insertElementAt(trail, 2*i, errorCode); | |
227 | compositions->insertElementAt(start, 2*i+1, errorCode); | |
228 | } | |
229 | ||
230 | void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { | |
231 | if(!norm.hasMapping()) { return; } | |
232 | const UnicodeString &m=*norm.mapping; | |
233 | UnicodeString *decomposed=nullptr; | |
234 | const UChar *s=toUCharPtr(m.getBuffer()); | |
235 | int32_t length=m.length(); | |
236 | int32_t prev, i=0; | |
237 | UChar32 c; | |
238 | while(i<length) { | |
239 | prev=i; | |
240 | U16_NEXT(s, i, length, c); | |
241 | if(start<=c && c<=end) { | |
242 | fprintf(stderr, | |
243 | "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", | |
244 | (long)c); | |
245 | exit(U_INVALID_FORMAT_ERROR); | |
246 | } | |
247 | const Norm &cNorm=norms.getNormRef(c); | |
248 | if(cNorm.hasMapping()) { | |
249 | if(norm.mappingType==Norm::ROUND_TRIP) { | |
250 | if(prev==0) { | |
251 | if(cNorm.mappingType!=Norm::ROUND_TRIP) { | |
252 | fprintf(stderr, | |
253 | "gennorm2 error: " | |
254 | "U+%04lX's round-trip mapping's starter " | |
255 | "U+%04lX one-way-decomposes, " | |
256 | "not possible in Unicode normalization\n", | |
257 | (long)start, (long)c); | |
258 | exit(U_INVALID_FORMAT_ERROR); | |
259 | } | |
260 | uint8_t myTrailCC=norms.getCC(m.char32At(i)); | |
261 | UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); | |
262 | uint8_t cTrailCC=norms.getCC(cTrailChar); | |
263 | if(cTrailCC>myTrailCC) { | |
264 | fprintf(stderr, | |
265 | "gennorm2 error: " | |
266 | "U+%04lX's round-trip mapping's starter " | |
267 | "U+%04lX decomposes and the " | |
268 | "inner/earlier tccc=%hu > outer/following tccc=%hu, " | |
269 | "not possible in Unicode normalization\n", | |
270 | (long)start, (long)c, | |
271 | (short)cTrailCC, (short)myTrailCC); | |
272 | exit(U_INVALID_FORMAT_ERROR); | |
273 | } | |
274 | } else { | |
275 | fprintf(stderr, | |
276 | "gennorm2 error: " | |
277 | "U+%04lX's round-trip mapping's non-starter " | |
278 | "U+%04lX decomposes, " | |
279 | "not possible in Unicode normalization\n", | |
280 | (long)start, (long)c); | |
281 | exit(U_INVALID_FORMAT_ERROR); | |
282 | } | |
283 | } | |
284 | if(decomposed==nullptr) { | |
285 | decomposed=new UnicodeString(m, 0, prev); | |
286 | } | |
287 | decomposed->append(*cNorm.mapping); | |
288 | } else if(Hangul::isHangul(c)) { | |
289 | UChar buffer[3]; | |
290 | int32_t hangulLength=Hangul::decompose(c, buffer); | |
291 | if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { | |
292 | fprintf(stderr, | |
293 | "gennorm2 error: " | |
294 | "U+%04lX's round-trip mapping's non-starter " | |
295 | "U+%04lX decomposes, " | |
296 | "not possible in Unicode normalization\n", | |
297 | (long)start, (long)c); | |
298 | exit(U_INVALID_FORMAT_ERROR); | |
299 | } | |
300 | if(decomposed==nullptr) { | |
301 | decomposed=new UnicodeString(m, 0, prev); | |
302 | } | |
303 | decomposed->append(buffer, hangulLength); | |
304 | } else if(decomposed!=nullptr) { | |
305 | decomposed->append(m, prev, i-prev); | |
306 | } | |
307 | } | |
308 | if(decomposed!=nullptr) { | |
309 | if(norm.rawMapping==nullptr) { | |
310 | // Remember the original mapping when decomposing recursively. | |
311 | norm.rawMapping=norm.mapping; | |
312 | } else { | |
313 | delete norm.mapping; | |
314 | } | |
315 | norm.mapping=decomposed; | |
316 | // Not norm.setMappingCP(); because the original mapping | |
317 | // is most likely to be encodable as a delta. | |
318 | didDecompose|=TRUE; | |
319 | } | |
320 | } | |
321 | ||
322 | U_NAMESPACE_END | |
323 | ||
324 | #endif // #if !UCONFIG_NO_NORMALIZATION |