]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
729e4ab9 A |
3 | /* |
4 | ******************************************************************************* | |
5 | * | |
f3c0d7a5 | 6 | * Copyright (C) 2009-2016, International Business Machines |
729e4ab9 A |
7 | * Corporation and others. All Rights Reserved. |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: n2builder.cpp | |
f3c0d7a5 | 11 | * encoding: UTF-8 |
729e4ab9 A |
12 | * tab size: 8 (not used) |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2009nov25 | |
16 | * created by: Markus W. Scherer | |
17 | * | |
18 | * Builds Normalizer2 data and writes a binary .nrm file. | |
19 | * For the file format see source/common/normalizer2impl.h. | |
20 | */ | |
21 | ||
22 | #include "unicode/utypes.h" | |
23 | #include "n2builder.h" | |
24 | ||
25 | #include <stdio.h> | |
26 | #include <stdlib.h> | |
27 | #include <string.h> | |
729e4ab9 | 28 | #include <vector> |
729e4ab9 A |
29 | #include "unicode/errorcode.h" |
30 | #include "unicode/localpointer.h" | |
31 | #include "unicode/putil.h" | |
32 | #include "unicode/udata.h" | |
33 | #include "unicode/uniset.h" | |
34 | #include "unicode/unistr.h" | |
0f5d89e8 | 35 | #include "unicode/usetiter.h" |
729e4ab9 | 36 | #include "unicode/ustring.h" |
b331163b | 37 | #include "charstr.h" |
0f5d89e8 | 38 | #include "extradata.h" |
729e4ab9 A |
39 | #include "hash.h" |
40 | #include "normalizer2impl.h" | |
0f5d89e8 | 41 | #include "norms.h" |
729e4ab9 A |
42 | #include "toolutil.h" |
43 | #include "unewdata.h" | |
44 | #include "utrie2.h" | |
45 | #include "uvectr32.h" | |
b331163b | 46 | #include "writesrc.h" |
729e4ab9 A |
47 | |
48 | #if !UCONFIG_NO_NORMALIZATION | |
49 | ||
50 | /* UDataInfo cf. udata.h */ | |
51 | static UDataInfo dataInfo={ | |
52 | sizeof(UDataInfo), | |
53 | 0, | |
54 | ||
55 | U_IS_BIG_ENDIAN, | |
56 | U_CHARSET_FAMILY, | |
57 | U_SIZEOF_UCHAR, | |
58 | 0, | |
59 | ||
60 | { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ | |
0f5d89e8 A |
61 | { 3, 0, 0, 0 }, /* formatVersion */ |
62 | { 10, 0, 0, 0 } /* dataVersion (Unicode version) */ | |
729e4ab9 A |
63 | }; |
64 | ||
65 | U_NAMESPACE_BEGIN | |
66 | ||
67 | class HangulIterator { | |
68 | public: | |
69 | struct Range { | |
0f5d89e8 | 70 | UChar32 start, end; |
729e4ab9 A |
71 | }; |
72 | ||
73 | HangulIterator() : rangeIndex(0) {} | |
74 | const Range *nextRange() { | |
b331163b | 75 | if(rangeIndex<UPRV_LENGTHOF(ranges)) { |
729e4ab9 A |
76 | return ranges+rangeIndex++; |
77 | } else { | |
78 | return NULL; | |
79 | } | |
80 | } | |
729e4ab9 A |
81 | private: |
82 | static const Range ranges[4]; | |
83 | int32_t rangeIndex; | |
84 | }; | |
85 | ||
86 | const HangulIterator::Range HangulIterator::ranges[4]={ | |
0f5d89e8 A |
87 | { Hangul::JAMO_L_BASE, Hangul::JAMO_L_END }, |
88 | { Hangul::JAMO_V_BASE, Hangul::JAMO_V_END }, | |
729e4ab9 | 89 | // JAMO_T_BASE+1: not U+11A7 |
0f5d89e8 A |
90 | { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END }, |
91 | { Hangul::HANGUL_BASE, Hangul::HANGUL_END }, | |
729e4ab9 A |
92 | }; |
93 | ||
729e4ab9 | 94 | Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : |
0f5d89e8 | 95 | norms(errorCode), |
b331163b | 96 | phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL), |
0f5d89e8 | 97 | norm16Trie(nullptr), norm16TrieLength(0) { |
729e4ab9 | 98 | memset(unicodeVersion, 0, sizeof(unicodeVersion)); |
729e4ab9 | 99 | memset(indexes, 0, sizeof(indexes)); |
4388f060 | 100 | memset(smallFCD, 0, sizeof(smallFCD)); |
729e4ab9 A |
101 | } |
102 | ||
103 | Normalizer2DataBuilder::~Normalizer2DataBuilder() { | |
729e4ab9 A |
104 | utrie2_close(norm16Trie); |
105 | } | |
106 | ||
107 | void | |
108 | Normalizer2DataBuilder::setUnicodeVersion(const char *v) { | |
4388f060 A |
109 | UVersionInfo nullVersion={ 0, 0, 0, 0 }; |
110 | UVersionInfo version; | |
111 | u_versionFromString(version, v); | |
112 | if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && | |
113 | 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) | |
114 | ) { | |
115 | char buffer[U_MAX_VERSION_STRING_LENGTH]; | |
116 | u_versionToString(unicodeVersion, buffer); | |
117 | fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n", | |
118 | buffer, v); | |
119 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
120 | } | |
121 | memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); | |
729e4ab9 A |
122 | } |
123 | ||
729e4ab9 A |
124 | Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { |
125 | if(p!=NULL) { | |
126 | if(p->mappingType!=Norm::NONE) { | |
127 | if( overrideHandling==OVERRIDE_NONE || | |
128 | (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) | |
129 | ) { | |
130 | fprintf(stderr, | |
131 | "error in gennorm2 phase %d: " | |
132 | "not permitted to override mapping for U+%04lX from phase %d\n", | |
133 | (int)phase, (long)c, (int)p->mappingPhase); | |
134 | exit(U_INVALID_FORMAT_ERROR); | |
135 | } | |
136 | delete p->mapping; | |
137 | p->mapping=NULL; | |
138 | } | |
139 | p->mappingPhase=phase; | |
140 | } | |
141 | return p; | |
142 | } | |
143 | ||
144 | void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { | |
145 | overrideHandling=oh; | |
146 | ++phase; | |
147 | } | |
148 | ||
149 | void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { | |
0f5d89e8 A |
150 | norms.createNorm(c)->cc=cc; |
151 | norms.ccSet.add(c); | |
729e4ab9 A |
152 | } |
153 | ||
154 | static UBool isWellFormed(const UnicodeString &s) { | |
155 | UErrorCode errorCode=U_ZERO_ERROR; | |
f3c0d7a5 | 156 | u_strToUTF8(NULL, 0, NULL, toUCharPtr(s.getBuffer()), s.length(), &errorCode); |
729e4ab9 A |
157 | return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; |
158 | } | |
159 | ||
160 | void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { | |
161 | if(!isWellFormed(m)) { | |
162 | fprintf(stderr, | |
163 | "error in gennorm2 phase %d: " | |
164 | "illegal one-way mapping from U+%04lX to malformed string\n", | |
165 | (int)phase, (long)c); | |
166 | exit(U_INVALID_FORMAT_ERROR); | |
167 | } | |
0f5d89e8 | 168 | Norm *p=checkNormForMapping(norms.createNorm(c), c); |
729e4ab9 A |
169 | p->mapping=new UnicodeString(m); |
170 | p->mappingType=Norm::ONE_WAY; | |
171 | p->setMappingCP(); | |
0f5d89e8 | 172 | norms.mappingSet.add(c); |
729e4ab9 A |
173 | } |
174 | ||
175 | void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { | |
176 | if(U_IS_SURROGATE(c)) { | |
177 | fprintf(stderr, | |
178 | "error in gennorm2 phase %d: " | |
179 | "illegal round-trip mapping from surrogate code point U+%04lX\n", | |
180 | (int)phase, (long)c); | |
181 | exit(U_INVALID_FORMAT_ERROR); | |
182 | } | |
183 | if(!isWellFormed(m)) { | |
184 | fprintf(stderr, | |
185 | "error in gennorm2 phase %d: " | |
186 | "illegal round-trip mapping from U+%04lX to malformed string\n", | |
187 | (int)phase, (long)c); | |
188 | exit(U_INVALID_FORMAT_ERROR); | |
189 | } | |
f3c0d7a5 | 190 | int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length()); |
729e4ab9 A |
191 | if(numCP!=2) { |
192 | fprintf(stderr, | |
193 | "error in gennorm2 phase %d: " | |
194 | "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", | |
195 | (int)phase, (long)c, (int)numCP); | |
196 | exit(U_INVALID_FORMAT_ERROR); | |
197 | } | |
0f5d89e8 | 198 | Norm *p=checkNormForMapping(norms.createNorm(c), c); |
729e4ab9 A |
199 | p->mapping=new UnicodeString(m); |
200 | p->mappingType=Norm::ROUND_TRIP; | |
201 | p->mappingCP=U_SENTINEL; | |
0f5d89e8 | 202 | norms.mappingSet.add(c); |
729e4ab9 A |
203 | } |
204 | ||
205 | void Normalizer2DataBuilder::removeMapping(UChar32 c) { | |
0f5d89e8 A |
206 | // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data. |
207 | Norm *p=checkNormForMapping(norms.createNorm(c), c); | |
208 | p->mappingType=Norm::REMOVED; | |
209 | norms.mappingSet.add(c); | |
729e4ab9 A |
210 | } |
211 | ||
0f5d89e8 A |
// Decides whether the mapping held in buffer ends with a composition boundary,
// that is, whether nothing at its end can still combine with what follows it.
//   buffer:      code points and combining classes of the mapping, read via
//                charAt()/ccAt()/length()/lastStarterIndex().
//   mappingType: only Norm::ONE_WAY gets special treatment (a trailing mark with ccc>1
//                prevents a boundary).
// Returns TRUE if there is a boundary after the mapping; FALSE otherwise, including
// for an empty mapping and for a mapping without a starter.
212 | UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer, |
213 | Norm::MappingType mappingType) const { | |
729e4ab9 | 214 | if(buffer.isEmpty()) {
0f5d89e8 | 215 | return FALSE; // Maps-to-empty-string is no boundary of any kind.
729e4ab9 A |
216 | } |
217 | int32_t lastStarterIndex=buffer.lastStarterIndex(); | |
218 | if(lastStarterIndex<0) { | |
0f5d89e8 A |
219 | return FALSE; // no starter |
220 | } | |
221 | const int32_t lastIndex=buffer.length()-1; | |
222 | if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) { | |
223 | // One-way mapping where after the last starter is at least one combining mark | |
224 | // with a combining class greater than 1, | |
225 | // which means that another combining mark can reorder before it. | |
226 | // By contrast, in a round-trip mapping this does not prevent a boundary as long as | |
227 | // the starter or composite does not combine-forward with a following combining mark. | |
228 | return FALSE; | |
729e4ab9 A |
229 | } |
230 | UChar32 starter=buffer.charAt(lastStarterIndex); | |
0f5d89e8 A |
231 | if(lastStarterIndex==0 && norms.combinesBack(starter)) { |
232 | // The last starter is at the beginning of the mapping and combines backward. | |
233 | return FALSE; | |
234 | } | |
235 | if(Hangul::isJamoL(starter) || | |
236 | (Hangul::isJamoV(starter) && | |
237 | 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) { | |
729e4ab9 A |
238 | // A Jamo leading consonant or an LV pair combines-forward if it is at the end, |
239 | // otherwise it is blocked. | |
0f5d89e8 | 240 | return lastStarterIndex!=lastIndex;
729e4ab9 | 241 | }
4388f060 | 242 | // Note: There can be no Hangul syllable in the fully decomposed mapping.
0f5d89e8 A |
243 |
244 | // Multiple starters can combine into one. | |
245 | // Look for the first of the last sequence of starters, excluding Jamos. | |
246 | int32_t i=lastStarterIndex; | |
247 | UChar32 c; | |
// The loop condition assigns c as a side effect of its last operand; c is only
// used in the body, which runs only after that assignment happened.
248 | while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) { | |
249 | starter=c; | |
250 | --i; | |
251 | } | |
252 | // Compose as far as possible, and see if further compositions with | |
253 | // characters following this mapping are possible. | |
254 | const Norm *starterNorm=norms.getNorm(starter); | |
255 | if(i==lastStarterIndex && | |
256 | (starterNorm==nullptr || starterNorm->compositions==nullptr)) { | |
257 | return TRUE; // The last starter does not combine forward. | |
729e4ab9 | 258 | }
729e4ab9 | 259 | uint8_t prevCC=0;
0f5d89e8 A |
260 | while(++i<buffer.length()) { |
261 | uint8_t cc=buffer.ccAt(i); // !=0 if after last starter | |
// NOTE(review): *starterNorm is evaluated only when i>lastStarterIndex. Every path that
// continues past i==lastStarterIndex has gone through one of the nullptr checks that
// return TRUE, so starterNorm appears to be non-null here; confirm before restructuring.
262 | if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) { | |
263 | // The starter combines with a mark that reorders before the current one. | |
264 | return FALSE; | |
729e4ab9 | 265 | }
0f5d89e8 A |
// This c shadows the function-scope c declared before the starter scan above.
266 | UChar32 c=buffer.charAt(i); |
// The condition below also assigns starter (the composite) when the combination succeeds.
267 | if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && | |
268 | norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) { | |
269 | // The starter combines with c into a composite replacement starter. | |
270 | starterNorm=norms.getNorm(starter); | |
271 | if(i>=lastStarterIndex && | |
272 | (starterNorm==nullptr || starterNorm->compositions==nullptr)) { | |
273 | return TRUE; // The composite does not combine further. | |
729e4ab9 | 274 | }
0f5d89e8 A |
275 | // Keep prevCC because we "removed" the combining mark. |
276 | } else if(cc==0) { | |
277 | starterNorm=norms.getNorm(c); | |
278 | if(i==lastStarterIndex && | |
279 | (starterNorm==nullptr || starterNorm->compositions==nullptr)) { | |
280 | return TRUE; // The new starter does not combine forward. | |
281 | } | |
282 | prevCC=0; | |
729e4ab9 A |
283 | } else { |
284 | prevCC=cc; | |
4388f060 | 285 | }
729e4ab9 | 286 | }
0f5d89e8 A |
287 | if(prevCC==0) { |
288 | return FALSE; // forward-combining starter at the very end | |
729e4ab9 | 289 | }
0f5d89e8 A |
// NOTE(review): prevCC!=0 means the loop processed a mark after the last starter,
// so the same non-null reasoning for starterNorm applies to the dereference below.
290 | if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) { |
291 | // The starter combines with another mark. | |
292 | return FALSE; | |
729e4ab9 | 293 | }
0f5d89e8 | 294 | return TRUE;
729e4ab9 A |
295 | } |
296 | ||
0f5d89e8 A |
297 | UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const { |
298 | if(buffer.lastStarterIndex()<0) { | |
299 | return FALSE; // no starter | |
729e4ab9 | 300 | } |
0f5d89e8 A |
301 | const Norm *starterNorm=nullptr; |
302 | uint8_t prevCC=0; | |
303 | for(int32_t i=0; i<buffer.length(); ++i) { | |
304 | UChar32 c=buffer.charAt(i); | |
305 | uint8_t cc=buffer.ccAt(i); | |
306 | if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && | |
307 | norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) { | |
308 | return TRUE; // normal composite | |
309 | } else if(cc==0) { | |
310 | if(Hangul::isJamoL(c)) { | |
311 | if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) { | |
312 | return TRUE; // Hangul syllable | |
313 | } | |
314 | starterNorm=nullptr; | |
729e4ab9 | 315 | } else { |
0f5d89e8 | 316 | starterNorm=norms.getNorm(c); |
729e4ab9 | 317 | } |
729e4ab9 | 318 | } |
0f5d89e8 | 319 | prevCC=cc; |
729e4ab9 | 320 | } |
0f5d89e8 | 321 | return FALSE; |
729e4ab9 A |
322 | } |
323 | ||
0f5d89e8 A |
324 | void Normalizer2DataBuilder::postProcess(Norm &norm) { |
325 | // Prerequisites: Compositions are built, mappings are recursively decomposed. | |
326 | // Mappings are not yet in canonical order. | |
327 | // | |
328 | // This function works on a Norm struct. We do not know which code point(s) map(s) to it. | |
329 | // Therefore, we cannot compute algorithmic mapping deltas here. | |
330 | // Error conditions are checked, but printed later when we do know the offending code point. | |
331 | if(norm.hasMapping()) { | |
332 | if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) { | |
333 | norm.error="mapping longer than maximum of 31"; | |
334 | return; | |
729e4ab9 | 335 | } |
0f5d89e8 A |
336 | // Ensure canonical order. |
337 | BuilderReorderingBuffer buffer; | |
338 | if(norm.rawMapping!=nullptr) { | |
339 | norms.reorder(*norm.rawMapping, buffer); | |
340 | buffer.reset(); | |
729e4ab9 | 341 | } |
0f5d89e8 A |
342 | norms.reorder(*norm.mapping, buffer); |
343 | if(buffer.isEmpty()) { | |
344 | // A character that is deleted (maps to an empty string) must | |
345 | // get the worst-case lccc and tccc values because arbitrary | |
346 | // characters on both sides will become adjacent. | |
347 | norm.leadCC=1; | |
348 | norm.trailCC=0xff; | |
4388f060 | 349 | } else { |
0f5d89e8 A |
350 | norm.leadCC=buffer.ccAt(0); |
351 | norm.trailCC=buffer.ccAt(buffer.length()-1); | |
729e4ab9 | 352 | } |
0f5d89e8 A |
353 | |
354 | norm.hasCompBoundaryBefore= | |
355 | !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0)); | |
356 | norm.hasCompBoundaryAfter= | |
357 | norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType); | |
358 | ||
359 | if(norm.combinesBack) { | |
360 | norm.error="combines-back and decomposes, not possible in Unicode normalization"; | |
361 | } else if(norm.mappingType==Norm::ROUND_TRIP) { | |
362 | if(norm.compositions!=NULL) { | |
363 | norm.type=Norm::YES_NO_COMBINES_FWD; | |
364 | } else { | |
365 | norm.type=Norm::YES_NO_MAPPING_ONLY; | |
366 | } | |
367 | } else { // one-way mapping | |
368 | if(norm.compositions!=NULL) { | |
369 | norm.error="combines-forward and has a one-way mapping, " | |
370 | "not possible in Unicode normalization"; | |
371 | } else if(buffer.isEmpty()) { | |
372 | norm.type=Norm::NO_NO_EMPTY; | |
373 | } else if(!norm.hasCompBoundaryBefore) { | |
374 | norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC; | |
375 | } else if(mappingRecomposes(buffer)) { | |
376 | norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE; | |
377 | } else { | |
378 | // The mapping is comp-normalized. | |
379 | norm.type=Norm::NO_NO_COMP_YES; | |
729e4ab9 A |
380 | } |
381 | } | |
0f5d89e8 A |
382 | } else { // no mapping |
383 | norm.leadCC=norm.trailCC=norm.cc; | |
384 | ||
385 | norm.hasCompBoundaryBefore= | |
386 | norm.cc==0 && !norm.combinesBack; | |
387 | norm.hasCompBoundaryAfter= | |
388 | norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr; | |
389 | ||
390 | if(norm.combinesBack) { | |
391 | if(norm.compositions!=nullptr) { | |
392 | // Earlier code checked ccc=0. | |
393 | norm.type=Norm::MAYBE_YES_COMBINES_FWD; | |
729e4ab9 | 394 | } else { |
0f5d89e8 | 395 | norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc |
729e4ab9 | 396 | } |
0f5d89e8 A |
397 | } else if(norm.compositions!=nullptr) { |
398 | // Earlier code checked ccc=0. | |
399 | norm.type=Norm::YES_YES_COMBINES_FWD; | |
400 | } else if(norm.cc!=0) { | |
401 | norm.type=Norm::YES_YES_WITH_CC; | |
402 | } else { | |
403 | norm.type=Norm::INERT; | |
729e4ab9 A |
404 | } |
405 | } | |
406 | } | |
407 | ||
0f5d89e8 | 408 | class Norm16Writer : public Norms::Enumerator { |
729e4ab9 | 409 | public: |
0f5d89e8 A |
410 | Norm16Writer(Norms &n, Normalizer2DataBuilder &b) : Norms::Enumerator(n), builder(b) {} |
411 | void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE { | |
412 | builder.writeNorm16(start, end, norm); | |
729e4ab9 | 413 | } |
0f5d89e8 | 414 | Normalizer2DataBuilder &builder; |
729e4ab9 A |
415 | }; |
416 | ||
0f5d89e8 A |
417 | void Normalizer2DataBuilder::setSmallFCD(UChar32 c) { |
418 | UChar32 lead= c<=0xffff ? c : U16_LEAD(c); | |
419 | smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); | |
420 | } | |
421 | ||
422 | void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) { | |
423 | if((norm.leadCC|norm.trailCC)!=0) { | |
424 | for(UChar32 c=start; c<=end; ++c) { | |
425 | setSmallFCD(c); | |
426 | } | |
427 | } | |
428 | ||
429 | int32_t norm16; | |
430 | switch(norm.type) { | |
431 | case Norm::INERT: | |
432 | norm16=Normalizer2Impl::INERT; | |
433 | break; | |
434 | case Norm::YES_YES_COMBINES_FWD: | |
435 | norm16=norm.offset*2; | |
436 | break; | |
437 | case Norm::YES_NO_COMBINES_FWD: | |
438 | norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2; | |
439 | break; | |
440 | case Norm::YES_NO_MAPPING_ONLY: | |
441 | norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2; | |
442 | break; | |
443 | case Norm::NO_NO_COMP_YES: | |
444 | norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2; | |
445 | break; | |
446 | case Norm::NO_NO_COMP_BOUNDARY_BEFORE: | |
447 | norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2; | |
448 | break; | |
449 | case Norm::NO_NO_COMP_NO_MAYBE_CC: | |
450 | norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2; | |
451 | break; | |
452 | case Norm::NO_NO_EMPTY: | |
453 | norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2; | |
454 | break; | |
455 | case Norm::NO_NO_DELTA: | |
456 | { | |
457 | // Positive offset from minNoNoDelta, shifted left for additional bits. | |
458 | int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT; | |
459 | if(norm.trailCC==0) { | |
460 | // DELTA_TCCC_0==0 | |
461 | } else if(norm.trailCC==1) { | |
462 | offset|=Normalizer2Impl::DELTA_TCCC_1; | |
463 | } else { | |
464 | offset|=Normalizer2Impl::DELTA_TCCC_GT_1; | |
729e4ab9 | 465 | } |
0f5d89e8 | 466 | norm16=getMinNoNoDelta()+offset; |
729e4ab9 | 467 | break; |
729e4ab9 | 468 | } |
0f5d89e8 A |
469 | case Norm::MAYBE_YES_COMBINES_FWD: |
470 | norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2; | |
471 | break; | |
472 | case Norm::MAYBE_YES_SIMPLE: | |
473 | norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255 | |
474 | break; | |
475 | case Norm::YES_YES_WITH_CC: | |
476 | U_ASSERT(norm.cc!=0); | |
477 | norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255 | |
478 | break; | |
479 | default: // Should not occur. | |
480 | exit(U_INTERNAL_PROGRAM_ERROR); | |
481 | } | |
482 | U_ASSERT((norm16&1)==0); | |
483 | if(norm.hasCompBoundaryAfter) { | |
484 | norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; | |
485 | } | |
486 | IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); | |
487 | utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode); | |
488 | ||
489 | // Set the minimum code points for real data lookups in the quick check loops. | |
490 | UBool isDecompNo= | |
491 | (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) || | |
492 | norm.cc!=0; | |
493 | if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { | |
494 | indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; | |
495 | } | |
496 | UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES; | |
497 | if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { | |
498 | indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; | |
499 | } | |
500 | if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) { | |
501 | indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start; | |
729e4ab9 A |
502 | } |
503 | } | |
504 | ||
505 | void Normalizer2DataBuilder::setHangulData() { | |
506 | HangulIterator hi; | |
507 | const HangulIterator::Range *range; | |
508 | // Check that none of the Hangul/Jamo code points have data. | |
509 | while((range=hi.nextRange())!=NULL) { | |
0f5d89e8 A |
510 | for(UChar32 c=range->start; c<=range->end; ++c) { |
511 | if(utrie2_get32(norm16Trie, c)>Normalizer2Impl::INERT) { | |
729e4ab9 A |
512 | fprintf(stderr, |
513 | "gennorm2 error: " | |
514 | "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", | |
515 | (long)c); | |
516 | exit(U_INVALID_FORMAT_ERROR); | |
517 | } | |
518 | } | |
519 | } | |
520 | // Set data for algorithmic runtime handling. | |
521 | IcuToolErrorCode errorCode("gennorm2/setHangulData()"); | |
0f5d89e8 A |
522 | |
523 | // Jamo V/T are maybeYes | |
524 | if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { | |
525 | indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE; | |
729e4ab9 | 526 | } |
0f5d89e8 A |
527 | utrie2_setRange32(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END, |
528 | Normalizer2Impl::JAMO_L, TRUE, errorCode); | |
529 | utrie2_setRange32(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END, | |
530 | Normalizer2Impl::JAMO_VT, TRUE, errorCode); | |
531 | // JAMO_T_BASE+1: not U+11A7 | |
532 | utrie2_setRange32(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END, | |
533 | Normalizer2Impl::JAMO_VT, TRUE, errorCode); | |
534 | ||
535 | // Hangul LV encoded as minYesNo | |
536 | uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO]; | |
537 | // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER | |
538 | uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]| | |
539 | Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; | |
540 | if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { | |
541 | indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE; | |
542 | } | |
543 | // Set the first LV, then write all other Hangul syllables as LVT, | |
544 | // then overwrite the remaining LV. | |
545 | // The UTrie2 should be able to compact this into 7 32-item blocks | |
546 | // because JAMO_T_COUNT is 28 and the UTrie2 granularity is 4. | |
547 | // (7*32=8*28 smallest common multiple) | |
548 | utrie2_set32(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode); | |
549 | utrie2_setRange32(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, | |
550 | lvt, TRUE, errorCode); | |
551 | UChar32 c=Hangul::HANGUL_BASE; | |
552 | while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) { | |
553 | utrie2_set32(norm16Trie, c, lv, errorCode); | |
554 | } | |
555 | errorCode.assertSuccess(); | |
729e4ab9 A |
556 | } |
557 | ||
0f5d89e8 A |
namespace {

// Accumulator filled by enumRangeMaxValue() while trie values are enumerated.
struct Norm16Summary {
    // Largest norm16 value seen so far.
    uint32_t maxNorm16;
    // Bitwise AND of all values seen so far: a bit is 0 if any value had a 0 there.
    // Used for worst-case HAS_COMP_BOUNDARY_AFTER.
    uint32_t andedNorm16;
};

}  // namespace
568 | ||
729e4ab9 A |
569 | U_CDECL_BEGIN |
570 | ||
571 | static UBool U_CALLCONV | |
572 | enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { | |
0f5d89e8 A |
573 | Norm16Summary *p=(Norm16Summary *)context; |
574 | if(value>p->maxNorm16) { | |
575 | p->maxNorm16=value; | |
729e4ab9 | 576 | } |
0f5d89e8 | 577 | p->andedNorm16&=value; |
729e4ab9 A |
578 | return TRUE; |
579 | } | |
580 | ||
581 | U_CDECL_END | |
582 | ||
583 | void Normalizer2DataBuilder::processData() { | |
584 | IcuToolErrorCode errorCode("gennorm2/processData()"); | |
0f5d89e8 | 585 | norm16Trie=utrie2_open(Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode); |
729e4ab9 A |
586 | errorCode.assertSuccess(); |
587 | ||
0f5d89e8 A |
588 | // Build composition lists before recursive decomposition, |
589 | // so that we still have the raw, pair-wise mappings. | |
590 | CompositionBuilder compBuilder(norms); | |
591 | norms.enumRanges(compBuilder); | |
729e4ab9 | 592 | |
0f5d89e8 A |
593 | // Recursively decompose all mappings. |
594 | Decomposer decomposer(norms); | |
729e4ab9 A |
595 | do { |
596 | decomposer.didDecompose=FALSE; | |
0f5d89e8 | 597 | norms.enumRanges(decomposer); |
729e4ab9 A |
598 | } while(decomposer.didDecompose); |
599 | ||
0f5d89e8 A |
600 | // Set the Norm::Type and other properties. |
601 | int32_t normsLength=norms.length(); | |
729e4ab9 | 602 | for(int32_t i=1; i<normsLength; ++i) { |
0f5d89e8 | 603 | postProcess(norms.getNormRefByIndex(i)); |
729e4ab9 A |
604 | } |
605 | ||
0f5d89e8 A |
606 | // Write the properties, mappings and composition lists to |
607 | // appropriate parts of the "extra data" array. | |
608 | ExtraData extra(norms, optimization==OPTIMIZE_FAST); | |
609 | norms.enumRanges(extra); | |
610 | ||
611 | extraData=extra.yesYesCompositions; | |
612 | indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2; | |
613 | extraData.append(extra.yesNoMappingsAndCompositions); | |
614 | indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2; | |
615 | extraData.append(extra.yesNoMappingsOnly); | |
616 | indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2; | |
617 | extraData.append(extra.noNoMappingsCompYes); | |
618 | indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2; | |
619 | extraData.append(extra.noNoMappingsCompBoundaryBefore); | |
620 | indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2; | |
621 | extraData.append(extra.noNoMappingsCompNoMaybeCC); | |
622 | indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2; | |
623 | extraData.append(extra.noNoMappingsEmpty); | |
624 | indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2; | |
625 | ||
626 | // Pad the maybeYesCompositions length to a multiple of 4, | |
627 | // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center. | |
628 | while(extra.maybeYesCompositions.length()&3) { | |
629 | extra.maybeYesCompositions.append((UChar)0); | |
630 | } | |
631 | extraData.insert(0, extra.maybeYesCompositions); | |
632 | indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= | |
633 | Normalizer2Impl::MIN_NORMAL_MAYBE_YES- | |
634 | extra.maybeYesCompositions.length()*2; | |
729e4ab9 | 635 | |
729e4ab9 A |
636 | // Pad to even length for 4-byte alignment of following data. |
637 | if(extraData.length()&1) { | |
638 | extraData.append((UChar)0); | |
639 | } | |
640 | ||
0f5d89e8 A |
641 | int32_t minNoNoDelta=getMinNoNoDelta(); |
642 | U_ASSERT((minNoNoDelta&7)==0); | |
729e4ab9 A |
643 | if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { |
644 | fprintf(stderr, | |
645 | "gennorm2 error: " | |
646 | "data structure overflow, too much mapping composition data\n"); | |
647 | exit(U_BUFFER_OVERFLOW_ERROR); | |
648 | } | |
649 | ||
0f5d89e8 A |
650 | // writeNorm16() and setHangulData() reduce these as needed. |
651 | indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; | |
652 | indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; | |
653 | indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000; | |
654 | ||
655 | // Map each code point to its norm16 value, | |
656 | // including the properties that fit directly, | |
657 | // and the offset to the "extra data" if necessary. | |
658 | Norm16Writer norm16Writer(norms, *this); | |
659 | norms.enumRanges(norm16Writer); | |
729e4ab9 A |
660 | |
661 | setHangulData(); | |
662 | ||
663 | // Look for the "worst" norm16 value of any supplementary code point | |
664 | // corresponding to a lead surrogate, and set it as that surrogate's value. | |
0f5d89e8 | 665 | // Enables UTF-16 quick check inner loops to look at only code units. |
729e4ab9 A |
666 | // |
667 | // We could be more sophisticated: | |
668 | // We could collect a bit set for whether there are values in the different | |
669 | // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) | |
670 | // and select the best value that only breaks the composition and/or decomposition | |
671 | // inner loops if necessary. | |
672 | // However, that seems like overkill for an optimization for supplementary characters. | |
673 | for(UChar lead=0xd800; lead<0xdc00; ++lead) { | |
0f5d89e8 A |
674 | uint32_t surrogateCPNorm16=utrie2_get32(norm16Trie, lead); |
675 | Norm16Summary summary={ surrogateCPNorm16, surrogateCPNorm16 }; | |
676 | utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &summary); | |
677 | uint32_t norm16=summary.maxNorm16; | |
678 | if(norm16>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && | |
679 | norm16>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]) { | |
729e4ab9 A |
680 | // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. |
681 | // Otherwise it might end up at something like JAMO_VT which stays in | |
682 | // the inner decomposition quick check loop. | |
0f5d89e8 | 683 | norm16=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; |
729e4ab9 | 684 | } |
0f5d89e8 A |
685 | norm16= |
686 | (norm16&~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)| | |
687 | (summary.andedNorm16&Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER); | |
688 | utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, norm16, errorCode); | |
729e4ab9 A |
689 | } |
690 | ||
691 | // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. | |
692 | // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) | |
693 | // which is harmless. | |
694 | // As a result, the minimum code points are always BMP code points. | |
695 | int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; | |
696 | if(minCP>=0x10000) { | |
697 | indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); | |
698 | } | |
699 | minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; | |
700 | if(minCP>=0x10000) { | |
701 | indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); | |
702 | } | |
0f5d89e8 A |
703 | minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP]; |
704 | if(minCP>=0x10000) { | |
705 | indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP); | |
706 | } | |
729e4ab9 | 707 | |
729e4ab9 | 708 | utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); |
b331163b | 709 | norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); |
729e4ab9 A |
710 | if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { |
711 | fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", | |
712 | errorCode.errorName()); | |
713 | exit(errorCode.reset()); | |
714 | } | |
715 | errorCode.reset(); | |
729e4ab9 A |
716 | |
717 | int32_t offset=(int32_t)sizeof(indexes); | |
718 | indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; | |
719 | offset+=norm16TrieLength; | |
720 | indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; | |
4388f060 A |
721 | offset+=extraData.length()*2; |
722 | indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; | |
723 | offset+=sizeof(smallFCD); | |
724 | int32_t totalSize=offset; | |
725 | for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { | |
729e4ab9 A |
726 | indexes[i]=totalSize; |
727 | } | |
728 | ||
729 | if(beVerbose) { | |
730 | printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); | |
731 | printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); | |
4388f060 | 732 | printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); |
729e4ab9 A |
733 | printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); |
734 | printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); | |
735 | printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); | |
0f5d89e8 A |
736 | printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]); |
737 | printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); | |
4388f060 | 738 | printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); |
0f5d89e8 A |
739 | printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); |
740 | printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]); | |
741 | printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]); | |
742 | printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]); | |
729e4ab9 | 743 | printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); |
0f5d89e8 | 744 | printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta); |
729e4ab9 A |
745 | printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); |
746 | } | |
747 | ||
4388f060 A |
748 | UVersionInfo nullVersion={ 0, 0, 0, 0 }; |
749 | if(0==memcmp(nullVersion, unicodeVersion, 4)) { | |
750 | u_versionFromString(unicodeVersion, U_UNICODE_VERSION); | |
751 | } | |
729e4ab9 | 752 | memcpy(dataInfo.dataVersion, unicodeVersion, 4); |
b331163b A |
753 | } |
754 | ||
755 | void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { | |
756 | processData(); | |
757 | ||
758 | IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); | |
759 | LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); | |
760 | utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); | |
761 | errorCode.assertSuccess(); | |
762 | ||
729e4ab9 A |
763 | UNewDataMemory *pData= |
764 | udata_create(NULL, NULL, filename, &dataInfo, | |
765 | haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); | |
766 | if(errorCode.isFailure()) { | |
767 | fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", | |
768 | filename, errorCode.errorName()); | |
769 | exit(errorCode.reset()); | |
770 | } | |
771 | udata_writeBlock(pData, indexes, sizeof(indexes)); | |
772 | udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); | |
f3c0d7a5 | 773 | udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length()); |
4388f060 | 774 | udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); |
729e4ab9 A |
775 | int32_t writtenSize=udata_finish(pData, errorCode); |
776 | if(errorCode.isFailure()) { | |
777 | fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); | |
778 | exit(errorCode.reset()); | |
779 | } | |
b331163b | 780 | int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; |
729e4ab9 A |
781 | if(writtenSize!=totalSize) { |
782 | fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", | |
783 | (long)writtenSize, (long)totalSize); | |
784 | exit(U_INTERNAL_PROGRAM_ERROR); | |
785 | } | |
786 | } | |
787 | ||
b331163b A |
788 | void |
789 | Normalizer2DataBuilder::writeCSourceFile(const char *filename) { | |
790 | processData(); | |
791 | ||
792 | IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); | |
793 | const char *basename=findBasename(filename); | |
794 | CharString path(filename, (int32_t)(basename-filename), errorCode); | |
795 | CharString dataName(basename, errorCode); | |
796 | const char *extension=strrchr(basename, '.'); | |
797 | if(extension!=NULL) { | |
798 | dataName.truncate((int32_t)(extension-basename)); | |
799 | } | |
800 | errorCode.assertSuccess(); | |
801 | ||
802 | LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); | |
803 | utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); | |
804 | errorCode.assertSuccess(); | |
805 | ||
806 | FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp"); | |
807 | if(f==NULL) { | |
808 | fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n", | |
809 | filename); | |
810 | exit(U_FILE_ACCESS_ERROR); | |
811 | return; | |
812 | } | |
f3c0d7a5 | 813 | fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f); |
b331163b A |
814 | char line[100]; |
815 | sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data()); | |
816 | usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n"); | |
817 | sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data()); | |
818 | usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n"); | |
819 | sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", | |
820 | dataName.data()); | |
821 | usrc_writeArray(f, | |
822 | line, | |
823 | indexes, 32, Normalizer2Impl::IX_COUNT, | |
824 | "\n};\n\n"); | |
825 | sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data()); | |
826 | usrc_writeUTrie2Arrays(f, | |
827 | line, NULL, | |
828 | norm16Trie, | |
829 | "\n};\n\n"); | |
830 | sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data()); | |
831 | usrc_writeArray(f, | |
832 | line, | |
833 | extraData.getBuffer(), 16, extraData.length(), | |
834 | "\n};\n\n"); | |
835 | sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data()); | |
836 | usrc_writeArray(f, | |
837 | line, | |
838 | smallFCD, 8, sizeof(smallFCD), | |
839 | "\n};\n\n"); | |
b331163b A |
840 | sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data()); |
841 | char line2[100]; | |
842 | sprintf(line2, "%s_trieIndex", dataName.data()); | |
843 | usrc_writeUTrie2Struct(f, | |
844 | line, | |
845 | norm16Trie, line2, NULL, | |
846 | "};\n"); | |
f3c0d7a5 | 847 | fputs("\n#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f); |
b331163b A |
848 | fclose(f); |
849 | } | |
850 | ||
0f5d89e8 A |
851 | namespace { |
852 | ||
853 | bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) { | |
854 | if(s1 == nullptr) { | |
855 | return s2 == nullptr; | |
856 | } else if(s2 == nullptr) { | |
857 | return false; | |
858 | } else { | |
859 | return *s1 == *s2; | |
860 | } | |
861 | } | |
862 | ||
863 | const char *typeChars = "?-=>"; | |
864 | ||
865 | void writeMapping(FILE *f, const UnicodeString *m) { | |
866 | if(m != nullptr && !m->isEmpty()) { | |
867 | int32_t i = 0; | |
868 | UChar32 c = m->char32At(i); | |
869 | fprintf(f, "%04lX", (long)c); | |
870 | while((i += U16_LENGTH(c)) < m->length()) { | |
871 | c = m->char32At(i); | |
872 | fprintf(f, " %04lX", (long)c); | |
873 | } | |
874 | } | |
875 | fputs("\n", f); | |
876 | } | |
877 | ||
878 | } // namespace | |
879 | ||
880 | void | |
881 | Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const { | |
882 | // Do not processData() before writing the input-syntax data file. | |
883 | FILE *f = fopen(filename, "w"); | |
884 | if(f == nullptr) { | |
885 | fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n", | |
886 | filename); | |
887 | exit(U_FILE_ACCESS_ERROR); | |
888 | return; | |
889 | } | |
890 | ||
891 | if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 || | |
892 | unicodeVersion[2] != 0 || unicodeVersion[3] != 0) { | |
893 | char uv[U_MAX_VERSION_STRING_LENGTH]; | |
894 | u_versionToString(unicodeVersion, uv); | |
895 | fprintf(f, "* Unicode %s\n\n", uv); | |
896 | } | |
897 | ||
898 | UnicodeSetIterator ccIter(norms.ccSet); | |
899 | UChar32 start = U_SENTINEL; | |
900 | UChar32 end = U_SENTINEL; | |
901 | uint8_t prevCC = 0; | |
902 | bool done = false; | |
903 | bool didWrite = false; | |
904 | do { | |
905 | UChar32 c; | |
906 | uint8_t cc; | |
907 | if(ccIter.next() && !ccIter.isString()) { | |
908 | c = ccIter.getCodepoint(); | |
909 | cc = norms.getCC(c); | |
910 | } else { | |
911 | c = 0x110000; | |
912 | cc = 0; | |
913 | done = true; | |
914 | } | |
915 | if(cc == prevCC && c == (end + 1)) { | |
916 | end = c; | |
917 | } else { | |
918 | if(prevCC != 0) { | |
919 | if(start == end) { | |
920 | fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC); | |
921 | } else { | |
922 | fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC); | |
923 | } | |
924 | didWrite = true; | |
925 | } | |
926 | start = end = c; | |
927 | prevCC = cc; | |
928 | } | |
929 | } while(!done); | |
930 | if(didWrite) { | |
931 | fputs("\n", f); | |
932 | } | |
933 | ||
934 | UnicodeSetIterator mIter(norms.mappingSet); | |
935 | start = U_SENTINEL; | |
936 | end = U_SENTINEL; | |
937 | const UnicodeString *prevMapping = nullptr; | |
938 | Norm::MappingType prevType = Norm::NONE; | |
939 | done = false; | |
940 | do { | |
941 | UChar32 c; | |
942 | const Norm *norm; | |
943 | if(mIter.next() && !mIter.isString()) { | |
944 | c = mIter.getCodepoint(); | |
945 | norm = norms.getNorm(c); | |
946 | } else { | |
947 | c = 0x110000; | |
948 | norm = nullptr; | |
949 | done = true; | |
950 | } | |
951 | const UnicodeString *mapping; | |
952 | Norm::MappingType type; | |
953 | if(norm == nullptr) { | |
954 | mapping = nullptr; | |
955 | type = Norm::NONE; | |
956 | } else { | |
957 | type = norm->mappingType; | |
958 | if(type == Norm::NONE) { | |
959 | mapping = nullptr; | |
960 | } else { | |
961 | mapping = norm->mapping; | |
962 | } | |
963 | } | |
964 | if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) { | |
965 | end = c; | |
966 | } else { | |
967 | if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) { | |
968 | if(start == end) { | |
969 | fprintf(f, "%04lX%c", (long)start, typeChars[prevType]); | |
970 | } else { | |
971 | fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]); | |
972 | } | |
973 | writeMapping(f, prevMapping); | |
974 | } | |
975 | start = end = c; | |
976 | prevMapping = mapping; | |
977 | prevType = type; | |
978 | } | |
979 | } while(!done); | |
980 | ||
981 | fclose(f); | |
982 | } | |
983 | ||
984 | void | |
985 | Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1, | |
986 | const Normalizer2DataBuilder &b2, | |
987 | Normalizer2DataBuilder &diff) { | |
988 | // Compute diff = b1 - b2 | |
989 | // so that we should be able to get b1 = b2 + diff. | |
990 | if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) { | |
991 | memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH); | |
992 | } | |
993 | ||
994 | UnicodeSet ccSet(b1.norms.ccSet); | |
995 | ccSet.addAll(b2.norms.ccSet); | |
996 | UnicodeSetIterator ccIter(ccSet); | |
997 | while(ccIter.next() && !ccIter.isString()) { | |
998 | UChar32 c = ccIter.getCodepoint(); | |
999 | uint8_t cc1 = b1.norms.getCC(c); | |
1000 | uint8_t cc2 = b2.norms.getCC(c); | |
1001 | if(cc1 != cc2) { | |
1002 | diff.setCC(c, cc1); | |
1003 | } | |
1004 | } | |
1005 | ||
1006 | UnicodeSet mSet(b1.norms.mappingSet); | |
1007 | mSet.addAll(b2.norms.mappingSet); | |
1008 | UnicodeSetIterator mIter(mSet); | |
1009 | while(mIter.next() && !mIter.isString()) { | |
1010 | UChar32 c = mIter.getCodepoint(); | |
1011 | const Norm *norm1 = b1.norms.getNorm(c); | |
1012 | const Norm *norm2 = b2.norms.getNorm(c); | |
1013 | const UnicodeString *mapping1; | |
1014 | Norm::MappingType type1; | |
1015 | if(norm1 == nullptr || !norm1->hasMapping()) { | |
1016 | mapping1 = nullptr; | |
1017 | type1 = Norm::NONE; | |
1018 | } else { | |
1019 | mapping1 = norm1->mapping; | |
1020 | type1 = norm1->mappingType; | |
1021 | } | |
1022 | const UnicodeString *mapping2; | |
1023 | Norm::MappingType type2; | |
1024 | if(norm2 == nullptr || !norm2->hasMapping()) { | |
1025 | mapping2 = nullptr; | |
1026 | type2 = Norm::NONE; | |
1027 | } else { | |
1028 | mapping2 = norm2->mapping; | |
1029 | type2 = norm2->mappingType; | |
1030 | } | |
1031 | if(type1 == type2 && equalStrings(mapping1, mapping2)) { | |
1032 | // Nothing to do. | |
1033 | } else if(type1 == Norm::NONE) { | |
1034 | diff.removeMapping(c); | |
1035 | } else if(type1 == Norm::ROUND_TRIP) { | |
1036 | diff.setRoundTripMapping(c, *mapping1); | |
1037 | } else if(type1 == Norm::ONE_WAY) { | |
1038 | diff.setOneWayMapping(c, *mapping1); | |
1039 | } | |
1040 | } | |
1041 | } | |
1042 | ||
729e4ab9 A |
1043 | U_NAMESPACE_END |
1044 | ||
1045 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ | |
1046 | ||
1047 | /* | |
1048 | * Hey, Emacs, please set the following: | |
1049 | * | |
1050 | * Local Variables: | |
1051 | * indent-tabs-mode: nil | |
1052 | * End: | |
1053 | */ |