]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
729e4ab9 A |
3 | /* |
4 | ******************************************************************************* | |
5 | * | |
f3c0d7a5 | 6 | * Copyright (C) 2009-2016, International Business Machines |
729e4ab9 A |
7 | * Corporation and others. All Rights Reserved. |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: n2builder.cpp | |
f3c0d7a5 | 11 | * encoding: UTF-8 |
729e4ab9 A |
12 | * tab size: 8 (not used) |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2009nov25 | |
16 | * created by: Markus W. Scherer | |
17 | * | |
18 | * Builds Normalizer2 data and writes a binary .nrm file. | |
19 | * For the file format see source/common/normalizer2impl.h. | |
20 | */ | |
21 | ||
22 | #include "unicode/utypes.h" | |
23 | #include "n2builder.h" | |
24 | ||
25 | #include <stdio.h> | |
26 | #include <stdlib.h> | |
27 | #include <string.h> | |
729e4ab9 | 28 | #include <vector> |
729e4ab9 A |
29 | #include "unicode/errorcode.h" |
30 | #include "unicode/localpointer.h" | |
31 | #include "unicode/putil.h" | |
3d1f044b | 32 | #include "unicode/ucptrie.h" |
729e4ab9 | 33 | #include "unicode/udata.h" |
3d1f044b | 34 | #include "unicode/umutablecptrie.h" |
729e4ab9 A |
35 | #include "unicode/uniset.h" |
36 | #include "unicode/unistr.h" | |
0f5d89e8 | 37 | #include "unicode/usetiter.h" |
729e4ab9 | 38 | #include "unicode/ustring.h" |
b331163b | 39 | #include "charstr.h" |
0f5d89e8 | 40 | #include "extradata.h" |
729e4ab9 A |
41 | #include "hash.h" |
42 | #include "normalizer2impl.h" | |
0f5d89e8 | 43 | #include "norms.h" |
729e4ab9 A |
44 | #include "toolutil.h" |
45 | #include "unewdata.h" | |
729e4ab9 | 46 | #include "uvectr32.h" |
b331163b | 47 | #include "writesrc.h" |
729e4ab9 A |
48 | |
49 | #if !UCONFIG_NO_NORMALIZATION | |
50 | ||
51 | /* UDataInfo cf. udata.h */ | |
52 | static UDataInfo dataInfo={ | |
53 | sizeof(UDataInfo), | |
54 | 0, | |
55 | ||
56 | U_IS_BIG_ENDIAN, | |
57 | U_CHARSET_FAMILY, | |
58 | U_SIZEOF_UCHAR, | |
59 | 0, | |
60 | ||
61 | { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ | |
3d1f044b A |
62 | { 4, 0, 0, 0 }, /* formatVersion */ |
63 | { 11, 0, 0, 0 } /* dataVersion (Unicode version) */ | |
729e4ab9 A |
64 | }; |
65 | ||
66 | U_NAMESPACE_BEGIN | |
67 | ||
68 | class HangulIterator { | |
69 | public: | |
70 | struct Range { | |
0f5d89e8 | 71 | UChar32 start, end; |
729e4ab9 A |
72 | }; |
73 | ||
74 | HangulIterator() : rangeIndex(0) {} | |
75 | const Range *nextRange() { | |
b331163b | 76 | if(rangeIndex<UPRV_LENGTHOF(ranges)) { |
729e4ab9 A |
77 | return ranges+rangeIndex++; |
78 | } else { | |
79 | return NULL; | |
80 | } | |
81 | } | |
729e4ab9 A |
82 | private: |
83 | static const Range ranges[4]; | |
84 | int32_t rangeIndex; | |
85 | }; | |
86 | ||
87 | const HangulIterator::Range HangulIterator::ranges[4]={ | |
0f5d89e8 A |
88 | { Hangul::JAMO_L_BASE, Hangul::JAMO_L_END }, |
89 | { Hangul::JAMO_V_BASE, Hangul::JAMO_V_END }, | |
729e4ab9 | 90 | // JAMO_T_BASE+1: not U+11A7 |
0f5d89e8 A |
91 | { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END }, |
92 | { Hangul::HANGUL_BASE, Hangul::HANGUL_END }, | |
729e4ab9 A |
93 | }; |
94 | ||
729e4ab9 | 95 | Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : |
0f5d89e8 | 96 | norms(errorCode), |
b331163b | 97 | phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL), |
3d1f044b | 98 | norm16TrieBytes(nullptr), norm16TrieLength(0) { |
729e4ab9 | 99 | memset(unicodeVersion, 0, sizeof(unicodeVersion)); |
729e4ab9 | 100 | memset(indexes, 0, sizeof(indexes)); |
4388f060 | 101 | memset(smallFCD, 0, sizeof(smallFCD)); |
729e4ab9 A |
102 | } |
103 | ||
104 | Normalizer2DataBuilder::~Normalizer2DataBuilder() { | |
3d1f044b | 105 | delete[] norm16TrieBytes; |
729e4ab9 A |
106 | } |
107 | ||
108 | void | |
109 | Normalizer2DataBuilder::setUnicodeVersion(const char *v) { | |
4388f060 A |
110 | UVersionInfo nullVersion={ 0, 0, 0, 0 }; |
111 | UVersionInfo version; | |
112 | u_versionFromString(version, v); | |
113 | if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && | |
114 | 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) | |
115 | ) { | |
116 | char buffer[U_MAX_VERSION_STRING_LENGTH]; | |
117 | u_versionToString(unicodeVersion, buffer); | |
118 | fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n", | |
119 | buffer, v); | |
120 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
121 | } | |
122 | memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); | |
729e4ab9 A |
123 | } |
124 | ||
729e4ab9 A |
125 | Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { |
126 | if(p!=NULL) { | |
127 | if(p->mappingType!=Norm::NONE) { | |
128 | if( overrideHandling==OVERRIDE_NONE || | |
129 | (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) | |
130 | ) { | |
131 | fprintf(stderr, | |
132 | "error in gennorm2 phase %d: " | |
133 | "not permitted to override mapping for U+%04lX from phase %d\n", | |
134 | (int)phase, (long)c, (int)p->mappingPhase); | |
135 | exit(U_INVALID_FORMAT_ERROR); | |
136 | } | |
137 | delete p->mapping; | |
138 | p->mapping=NULL; | |
139 | } | |
140 | p->mappingPhase=phase; | |
141 | } | |
142 | return p; | |
143 | } | |
144 | ||
145 | void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { | |
146 | overrideHandling=oh; | |
147 | ++phase; | |
148 | } | |
149 | ||
150 | void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { | |
0f5d89e8 A |
151 | norms.createNorm(c)->cc=cc; |
152 | norms.ccSet.add(c); | |
729e4ab9 A |
153 | } |
154 | ||
155 | static UBool isWellFormed(const UnicodeString &s) { | |
156 | UErrorCode errorCode=U_ZERO_ERROR; | |
f3c0d7a5 | 157 | u_strToUTF8(NULL, 0, NULL, toUCharPtr(s.getBuffer()), s.length(), &errorCode); |
729e4ab9 A |
158 | return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; |
159 | } | |
160 | ||
161 | void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { | |
162 | if(!isWellFormed(m)) { | |
163 | fprintf(stderr, | |
164 | "error in gennorm2 phase %d: " | |
165 | "illegal one-way mapping from U+%04lX to malformed string\n", | |
166 | (int)phase, (long)c); | |
167 | exit(U_INVALID_FORMAT_ERROR); | |
168 | } | |
0f5d89e8 | 169 | Norm *p=checkNormForMapping(norms.createNorm(c), c); |
729e4ab9 A |
170 | p->mapping=new UnicodeString(m); |
171 | p->mappingType=Norm::ONE_WAY; | |
172 | p->setMappingCP(); | |
0f5d89e8 | 173 | norms.mappingSet.add(c); |
729e4ab9 A |
174 | } |
175 | ||
176 | void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { | |
177 | if(U_IS_SURROGATE(c)) { | |
178 | fprintf(stderr, | |
179 | "error in gennorm2 phase %d: " | |
180 | "illegal round-trip mapping from surrogate code point U+%04lX\n", | |
181 | (int)phase, (long)c); | |
182 | exit(U_INVALID_FORMAT_ERROR); | |
183 | } | |
184 | if(!isWellFormed(m)) { | |
185 | fprintf(stderr, | |
186 | "error in gennorm2 phase %d: " | |
187 | "illegal round-trip mapping from U+%04lX to malformed string\n", | |
188 | (int)phase, (long)c); | |
189 | exit(U_INVALID_FORMAT_ERROR); | |
190 | } | |
f3c0d7a5 | 191 | int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length()); |
729e4ab9 A |
192 | if(numCP!=2) { |
193 | fprintf(stderr, | |
194 | "error in gennorm2 phase %d: " | |
195 | "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", | |
196 | (int)phase, (long)c, (int)numCP); | |
197 | exit(U_INVALID_FORMAT_ERROR); | |
198 | } | |
0f5d89e8 | 199 | Norm *p=checkNormForMapping(norms.createNorm(c), c); |
729e4ab9 A |
200 | p->mapping=new UnicodeString(m); |
201 | p->mappingType=Norm::ROUND_TRIP; | |
202 | p->mappingCP=U_SENTINEL; | |
0f5d89e8 | 203 | norms.mappingSet.add(c); |
729e4ab9 A |
204 | } |
205 | ||
206 | void Normalizer2DataBuilder::removeMapping(UChar32 c) { | |
0f5d89e8 A |
207 | // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data. |
208 | Norm *p=checkNormForMapping(norms.createNorm(c), c); | |
209 | p->mappingType=Norm::REMOVED; | |
210 | norms.mappingSet.add(c); | |
729e4ab9 A |
211 | } |
212 | ||
0f5d89e8 A |
213 | UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer, |
214 | Norm::MappingType mappingType) const { | |
729e4ab9 | 215 | if(buffer.isEmpty()) { |
0f5d89e8 | 216 | return FALSE; // Maps-to-empty-string is no boundary of any kind. |
729e4ab9 A |
217 | } |
218 | int32_t lastStarterIndex=buffer.lastStarterIndex(); | |
219 | if(lastStarterIndex<0) { | |
0f5d89e8 A |
220 | return FALSE; // no starter |
221 | } | |
222 | const int32_t lastIndex=buffer.length()-1; | |
223 | if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) { | |
224 | // One-way mapping where after the last starter is at least one combining mark | |
225 | // with a combining class greater than 1, | |
226 | // which means that another combining mark can reorder before it. | |
227 | // By contrast, in a round-trip mapping this does not prevent a boundary as long as | |
228 | // the starter or composite does not combine-forward with a following combining mark. | |
229 | return FALSE; | |
729e4ab9 A |
230 | } |
231 | UChar32 starter=buffer.charAt(lastStarterIndex); | |
0f5d89e8 A |
232 | if(lastStarterIndex==0 && norms.combinesBack(starter)) { |
233 | // The last starter is at the beginning of the mapping and combines backward. | |
234 | return FALSE; | |
235 | } | |
236 | if(Hangul::isJamoL(starter) || | |
237 | (Hangul::isJamoV(starter) && | |
238 | 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) { | |
729e4ab9 A |
239 | // A Jamo leading consonant or an LV pair combines-forward if it is at the end, |
240 | // otherwise it is blocked. | |
0f5d89e8 | 241 | return lastStarterIndex!=lastIndex; |
729e4ab9 | 242 | } |
4388f060 | 243 | // Note: There can be no Hangul syllable in the fully decomposed mapping. |
0f5d89e8 A |
244 | |
245 | // Multiple starters can combine into one. | |
246 | // Look for the first of the last sequence of starters, excluding Jamos. | |
247 | int32_t i=lastStarterIndex; | |
248 | UChar32 c; | |
249 | while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) { | |
250 | starter=c; | |
251 | --i; | |
252 | } | |
253 | // Compose as far as possible, and see if further compositions with | |
254 | // characters following this mapping are possible. | |
255 | const Norm *starterNorm=norms.getNorm(starter); | |
256 | if(i==lastStarterIndex && | |
257 | (starterNorm==nullptr || starterNorm->compositions==nullptr)) { | |
258 | return TRUE; // The last starter does not combine forward. | |
729e4ab9 | 259 | } |
729e4ab9 | 260 | uint8_t prevCC=0; |
0f5d89e8 A |
261 | while(++i<buffer.length()) { |
262 | uint8_t cc=buffer.ccAt(i); // !=0 if after last starter | |
263 | if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) { | |
264 | // The starter combines with a mark that reorders before the current one. | |
265 | return FALSE; | |
729e4ab9 | 266 | } |
0f5d89e8 A |
267 | UChar32 c=buffer.charAt(i); |
268 | if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && | |
269 | norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) { | |
270 | // The starter combines with c into a composite replacement starter. | |
271 | starterNorm=norms.getNorm(starter); | |
272 | if(i>=lastStarterIndex && | |
273 | (starterNorm==nullptr || starterNorm->compositions==nullptr)) { | |
274 | return TRUE; // The composite does not combine further. | |
729e4ab9 | 275 | } |
0f5d89e8 A |
276 | // Keep prevCC because we "removed" the combining mark. |
277 | } else if(cc==0) { | |
278 | starterNorm=norms.getNorm(c); | |
279 | if(i==lastStarterIndex && | |
280 | (starterNorm==nullptr || starterNorm->compositions==nullptr)) { | |
281 | return TRUE; // The new starter does not combine forward. | |
282 | } | |
283 | prevCC=0; | |
729e4ab9 A |
284 | } else { |
285 | prevCC=cc; | |
4388f060 | 286 | } |
729e4ab9 | 287 | } |
0f5d89e8 A |
288 | if(prevCC==0) { |
289 | return FALSE; // forward-combining starter at the very end | |
729e4ab9 | 290 | } |
0f5d89e8 A |
291 | if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) { |
292 | // The starter combines with another mark. | |
293 | return FALSE; | |
729e4ab9 | 294 | } |
0f5d89e8 | 295 | return TRUE; |
729e4ab9 A |
296 | } |
297 | ||
0f5d89e8 A |
298 | UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const { |
299 | if(buffer.lastStarterIndex()<0) { | |
300 | return FALSE; // no starter | |
729e4ab9 | 301 | } |
0f5d89e8 A |
302 | const Norm *starterNorm=nullptr; |
303 | uint8_t prevCC=0; | |
304 | for(int32_t i=0; i<buffer.length(); ++i) { | |
305 | UChar32 c=buffer.charAt(i); | |
306 | uint8_t cc=buffer.ccAt(i); | |
307 | if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && | |
308 | norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) { | |
309 | return TRUE; // normal composite | |
310 | } else if(cc==0) { | |
311 | if(Hangul::isJamoL(c)) { | |
312 | if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) { | |
313 | return TRUE; // Hangul syllable | |
314 | } | |
315 | starterNorm=nullptr; | |
729e4ab9 | 316 | } else { |
0f5d89e8 | 317 | starterNorm=norms.getNorm(c); |
729e4ab9 | 318 | } |
729e4ab9 | 319 | } |
0f5d89e8 | 320 | prevCC=cc; |
729e4ab9 | 321 | } |
0f5d89e8 | 322 | return FALSE; |
729e4ab9 A |
323 | } |
324 | ||
0f5d89e8 A |
325 | void Normalizer2DataBuilder::postProcess(Norm &norm) { |
326 | // Prerequisites: Compositions are built, mappings are recursively decomposed. | |
327 | // Mappings are not yet in canonical order. | |
328 | // | |
329 | // This function works on a Norm struct. We do not know which code point(s) map(s) to it. | |
330 | // Therefore, we cannot compute algorithmic mapping deltas here. | |
331 | // Error conditions are checked, but printed later when we do know the offending code point. | |
332 | if(norm.hasMapping()) { | |
333 | if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) { | |
334 | norm.error="mapping longer than maximum of 31"; | |
335 | return; | |
729e4ab9 | 336 | } |
0f5d89e8 A |
337 | // Ensure canonical order. |
338 | BuilderReorderingBuffer buffer; | |
339 | if(norm.rawMapping!=nullptr) { | |
340 | norms.reorder(*norm.rawMapping, buffer); | |
341 | buffer.reset(); | |
729e4ab9 | 342 | } |
0f5d89e8 A |
343 | norms.reorder(*norm.mapping, buffer); |
344 | if(buffer.isEmpty()) { | |
345 | // A character that is deleted (maps to an empty string) must | |
346 | // get the worst-case lccc and tccc values because arbitrary | |
347 | // characters on both sides will become adjacent. | |
348 | norm.leadCC=1; | |
349 | norm.trailCC=0xff; | |
4388f060 | 350 | } else { |
0f5d89e8 A |
351 | norm.leadCC=buffer.ccAt(0); |
352 | norm.trailCC=buffer.ccAt(buffer.length()-1); | |
729e4ab9 | 353 | } |
0f5d89e8 A |
354 | |
355 | norm.hasCompBoundaryBefore= | |
356 | !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0)); | |
357 | norm.hasCompBoundaryAfter= | |
358 | norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType); | |
359 | ||
360 | if(norm.combinesBack) { | |
361 | norm.error="combines-back and decomposes, not possible in Unicode normalization"; | |
362 | } else if(norm.mappingType==Norm::ROUND_TRIP) { | |
363 | if(norm.compositions!=NULL) { | |
364 | norm.type=Norm::YES_NO_COMBINES_FWD; | |
365 | } else { | |
366 | norm.type=Norm::YES_NO_MAPPING_ONLY; | |
367 | } | |
368 | } else { // one-way mapping | |
369 | if(norm.compositions!=NULL) { | |
370 | norm.error="combines-forward and has a one-way mapping, " | |
371 | "not possible in Unicode normalization"; | |
372 | } else if(buffer.isEmpty()) { | |
373 | norm.type=Norm::NO_NO_EMPTY; | |
374 | } else if(!norm.hasCompBoundaryBefore) { | |
375 | norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC; | |
376 | } else if(mappingRecomposes(buffer)) { | |
377 | norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE; | |
378 | } else { | |
379 | // The mapping is comp-normalized. | |
380 | norm.type=Norm::NO_NO_COMP_YES; | |
729e4ab9 A |
381 | } |
382 | } | |
0f5d89e8 A |
383 | } else { // no mapping |
384 | norm.leadCC=norm.trailCC=norm.cc; | |
385 | ||
386 | norm.hasCompBoundaryBefore= | |
387 | norm.cc==0 && !norm.combinesBack; | |
388 | norm.hasCompBoundaryAfter= | |
389 | norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr; | |
390 | ||
391 | if(norm.combinesBack) { | |
392 | if(norm.compositions!=nullptr) { | |
393 | // Earlier code checked ccc=0. | |
394 | norm.type=Norm::MAYBE_YES_COMBINES_FWD; | |
729e4ab9 | 395 | } else { |
0f5d89e8 | 396 | norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc |
729e4ab9 | 397 | } |
0f5d89e8 A |
398 | } else if(norm.compositions!=nullptr) { |
399 | // Earlier code checked ccc=0. | |
400 | norm.type=Norm::YES_YES_COMBINES_FWD; | |
401 | } else if(norm.cc!=0) { | |
402 | norm.type=Norm::YES_YES_WITH_CC; | |
403 | } else { | |
404 | norm.type=Norm::INERT; | |
729e4ab9 A |
405 | } |
406 | } | |
407 | } | |
408 | ||
0f5d89e8 | 409 | class Norm16Writer : public Norms::Enumerator { |
729e4ab9 | 410 | public: |
3d1f044b A |
411 | Norm16Writer(UMutableCPTrie *trie, Norms &n, Normalizer2DataBuilder &b) : |
412 | Norms::Enumerator(n), builder(b), norm16Trie(trie) {} | |
0f5d89e8 | 413 | void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE { |
3d1f044b | 414 | builder.writeNorm16(norm16Trie, start, end, norm); |
729e4ab9 | 415 | } |
0f5d89e8 | 416 | Normalizer2DataBuilder &builder; |
3d1f044b | 417 | UMutableCPTrie *norm16Trie; |
729e4ab9 A |
418 | }; |
419 | ||
0f5d89e8 A |
420 | void Normalizer2DataBuilder::setSmallFCD(UChar32 c) { |
421 | UChar32 lead= c<=0xffff ? c : U16_LEAD(c); | |
422 | smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); | |
423 | } | |
424 | ||
3d1f044b | 425 | void Normalizer2DataBuilder::writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm) { |
0f5d89e8 A |
426 | if((norm.leadCC|norm.trailCC)!=0) { |
427 | for(UChar32 c=start; c<=end; ++c) { | |
428 | setSmallFCD(c); | |
429 | } | |
430 | } | |
431 | ||
432 | int32_t norm16; | |
433 | switch(norm.type) { | |
434 | case Norm::INERT: | |
435 | norm16=Normalizer2Impl::INERT; | |
436 | break; | |
437 | case Norm::YES_YES_COMBINES_FWD: | |
438 | norm16=norm.offset*2; | |
439 | break; | |
440 | case Norm::YES_NO_COMBINES_FWD: | |
441 | norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2; | |
442 | break; | |
443 | case Norm::YES_NO_MAPPING_ONLY: | |
444 | norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2; | |
445 | break; | |
446 | case Norm::NO_NO_COMP_YES: | |
447 | norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2; | |
448 | break; | |
449 | case Norm::NO_NO_COMP_BOUNDARY_BEFORE: | |
450 | norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2; | |
451 | break; | |
452 | case Norm::NO_NO_COMP_NO_MAYBE_CC: | |
453 | norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2; | |
454 | break; | |
455 | case Norm::NO_NO_EMPTY: | |
456 | norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2; | |
457 | break; | |
458 | case Norm::NO_NO_DELTA: | |
459 | { | |
460 | // Positive offset from minNoNoDelta, shifted left for additional bits. | |
461 | int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT; | |
462 | if(norm.trailCC==0) { | |
463 | // DELTA_TCCC_0==0 | |
464 | } else if(norm.trailCC==1) { | |
465 | offset|=Normalizer2Impl::DELTA_TCCC_1; | |
466 | } else { | |
467 | offset|=Normalizer2Impl::DELTA_TCCC_GT_1; | |
729e4ab9 | 468 | } |
0f5d89e8 | 469 | norm16=getMinNoNoDelta()+offset; |
729e4ab9 | 470 | break; |
729e4ab9 | 471 | } |
0f5d89e8 A |
472 | case Norm::MAYBE_YES_COMBINES_FWD: |
473 | norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2; | |
474 | break; | |
475 | case Norm::MAYBE_YES_SIMPLE: | |
476 | norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255 | |
477 | break; | |
478 | case Norm::YES_YES_WITH_CC: | |
479 | U_ASSERT(norm.cc!=0); | |
480 | norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255 | |
481 | break; | |
482 | default: // Should not occur. | |
483 | exit(U_INTERNAL_PROGRAM_ERROR); | |
484 | } | |
485 | U_ASSERT((norm16&1)==0); | |
486 | if(norm.hasCompBoundaryAfter) { | |
487 | norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; | |
488 | } | |
489 | IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); | |
3d1f044b | 490 | umutablecptrie_setRange(norm16Trie, start, end, (uint32_t)norm16, errorCode); |
0f5d89e8 A |
491 | |
492 | // Set the minimum code points for real data lookups in the quick check loops. | |
493 | UBool isDecompNo= | |
494 | (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) || | |
495 | norm.cc!=0; | |
496 | if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { | |
497 | indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; | |
498 | } | |
499 | UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES; | |
500 | if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { | |
501 | indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; | |
502 | } | |
503 | if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) { | |
504 | indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start; | |
729e4ab9 A |
505 | } |
506 | } | |
507 | ||
3d1f044b | 508 | void Normalizer2DataBuilder::setHangulData(UMutableCPTrie *norm16Trie) { |
729e4ab9 A |
509 | HangulIterator hi; |
510 | const HangulIterator::Range *range; | |
511 | // Check that none of the Hangul/Jamo code points have data. | |
512 | while((range=hi.nextRange())!=NULL) { | |
0f5d89e8 | 513 | for(UChar32 c=range->start; c<=range->end; ++c) { |
3d1f044b | 514 | if(umutablecptrie_get(norm16Trie, c)>Normalizer2Impl::INERT) { |
729e4ab9 A |
515 | fprintf(stderr, |
516 | "gennorm2 error: " | |
517 | "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", | |
518 | (long)c); | |
519 | exit(U_INVALID_FORMAT_ERROR); | |
520 | } | |
521 | } | |
522 | } | |
523 | // Set data for algorithmic runtime handling. | |
524 | IcuToolErrorCode errorCode("gennorm2/setHangulData()"); | |
0f5d89e8 A |
525 | |
526 | // Jamo V/T are maybeYes | |
527 | if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { | |
528 | indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE; | |
729e4ab9 | 529 | } |
3d1f044b A |
530 | umutablecptrie_setRange(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END, |
531 | Normalizer2Impl::JAMO_L, errorCode); | |
532 | umutablecptrie_setRange(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END, | |
533 | Normalizer2Impl::JAMO_VT, errorCode); | |
0f5d89e8 | 534 | // JAMO_T_BASE+1: not U+11A7 |
3d1f044b A |
535 | umutablecptrie_setRange(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END, |
536 | Normalizer2Impl::JAMO_VT, errorCode); | |
0f5d89e8 A |
537 | |
538 | // Hangul LV encoded as minYesNo | |
539 | uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO]; | |
540 | // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER | |
541 | uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]| | |
542 | Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; | |
543 | if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { | |
544 | indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE; | |
545 | } | |
546 | // Set the first LV, then write all other Hangul syllables as LVT, | |
547 | // then overwrite the remaining LV. | |
3d1f044b A |
548 | umutablecptrie_set(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode); |
549 | umutablecptrie_setRange(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, lvt, errorCode); | |
0f5d89e8 A |
550 | UChar32 c=Hangul::HANGUL_BASE; |
551 | while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) { | |
3d1f044b | 552 | umutablecptrie_set(norm16Trie, c, lv, errorCode); |
0f5d89e8 A |
553 | } |
554 | errorCode.assertSuccess(); | |
729e4ab9 A |
555 | } |
556 | ||
3d1f044b | 557 | LocalUCPTriePointer Normalizer2DataBuilder::processData() { |
0f5d89e8 A |
558 | // Build composition lists before recursive decomposition, |
559 | // so that we still have the raw, pair-wise mappings. | |
560 | CompositionBuilder compBuilder(norms); | |
561 | norms.enumRanges(compBuilder); | |
729e4ab9 | 562 | |
0f5d89e8 A |
563 | // Recursively decompose all mappings. |
564 | Decomposer decomposer(norms); | |
729e4ab9 A |
565 | do { |
566 | decomposer.didDecompose=FALSE; | |
0f5d89e8 | 567 | norms.enumRanges(decomposer); |
729e4ab9 A |
568 | } while(decomposer.didDecompose); |
569 | ||
0f5d89e8 A |
570 | // Set the Norm::Type and other properties. |
571 | int32_t normsLength=norms.length(); | |
729e4ab9 | 572 | for(int32_t i=1; i<normsLength; ++i) { |
0f5d89e8 | 573 | postProcess(norms.getNormRefByIndex(i)); |
729e4ab9 A |
574 | } |
575 | ||
0f5d89e8 A |
576 | // Write the properties, mappings and composition lists to |
577 | // appropriate parts of the "extra data" array. | |
578 | ExtraData extra(norms, optimization==OPTIMIZE_FAST); | |
579 | norms.enumRanges(extra); | |
580 | ||
581 | extraData=extra.yesYesCompositions; | |
582 | indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2; | |
583 | extraData.append(extra.yesNoMappingsAndCompositions); | |
584 | indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2; | |
585 | extraData.append(extra.yesNoMappingsOnly); | |
586 | indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2; | |
587 | extraData.append(extra.noNoMappingsCompYes); | |
588 | indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2; | |
589 | extraData.append(extra.noNoMappingsCompBoundaryBefore); | |
590 | indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2; | |
591 | extraData.append(extra.noNoMappingsCompNoMaybeCC); | |
592 | indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2; | |
593 | extraData.append(extra.noNoMappingsEmpty); | |
594 | indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2; | |
595 | ||
596 | // Pad the maybeYesCompositions length to a multiple of 4, | |
597 | // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center. | |
598 | while(extra.maybeYesCompositions.length()&3) { | |
599 | extra.maybeYesCompositions.append((UChar)0); | |
600 | } | |
601 | extraData.insert(0, extra.maybeYesCompositions); | |
602 | indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= | |
603 | Normalizer2Impl::MIN_NORMAL_MAYBE_YES- | |
604 | extra.maybeYesCompositions.length()*2; | |
729e4ab9 | 605 | |
729e4ab9 A |
606 | // Pad to even length for 4-byte alignment of following data. |
607 | if(extraData.length()&1) { | |
608 | extraData.append((UChar)0); | |
609 | } | |
610 | ||
0f5d89e8 A |
611 | int32_t minNoNoDelta=getMinNoNoDelta(); |
612 | U_ASSERT((minNoNoDelta&7)==0); | |
729e4ab9 A |
613 | if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { |
614 | fprintf(stderr, | |
615 | "gennorm2 error: " | |
616 | "data structure overflow, too much mapping composition data\n"); | |
617 | exit(U_BUFFER_OVERFLOW_ERROR); | |
618 | } | |
619 | ||
0f5d89e8 A |
620 | // writeNorm16() and setHangulData() reduce these as needed. |
621 | indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; | |
622 | indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; | |
623 | indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000; | |
624 | ||
3d1f044b A |
625 | IcuToolErrorCode errorCode("gennorm2/processData()"); |
626 | UMutableCPTrie *norm16Trie = umutablecptrie_open( | |
627 | Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode); | |
628 | errorCode.assertSuccess(); | |
629 | ||
0f5d89e8 A |
630 | // Map each code point to its norm16 value, |
631 | // including the properties that fit directly, | |
632 | // and the offset to the "extra data" if necessary. | |
3d1f044b | 633 | Norm16Writer norm16Writer(norm16Trie, norms, *this); |
0f5d89e8 | 634 | norms.enumRanges(norm16Writer); |
3d1f044b | 635 | // TODO: iterate via getRange() instead of callback? |
729e4ab9 | 636 | |
3d1f044b | 637 | setHangulData(norm16Trie); |
729e4ab9 A |
638 | |
639 | // Look for the "worst" norm16 value of any supplementary code point | |
640 | // corresponding to a lead surrogate, and set it as that surrogate's value. | |
0f5d89e8 | 641 | // Enables UTF-16 quick check inner loops to look at only code units. |
729e4ab9 A |
642 | // |
643 | // We could be more sophisticated: | |
644 | // We could collect a bit set for whether there are values in the different | |
645 | // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) | |
646 | // and select the best value that only breaks the composition and/or decomposition | |
647 | // inner loops if necessary. | |
648 | // However, that seems like overkill for an optimization for supplementary characters. | |
3d1f044b A |
649 | // |
650 | // First check that surrogate code *points* are inert. | |
651 | // The parser should have rejected values/mappings for them. | |
652 | uint32_t value; | |
653 | UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0, | |
654 | nullptr, nullptr, &value); | |
655 | if (value != Normalizer2Impl::INERT || end < 0xdfff) { | |
656 | fprintf(stderr, | |
657 | "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n", | |
658 | (int)end, (long)value); | |
659 | exit(U_INTERNAL_PROGRAM_ERROR); | |
660 | } | |
661 | uint32_t maxNorm16 = 0; | |
662 | // ANDing values yields 0 bits where any value has a 0. | |
663 | // Used for worst-case HAS_COMP_BOUNDARY_AFTER. | |
664 | uint32_t andedNorm16 = 0; | |
665 | end = 0; | |
666 | for (UChar32 start = 0x10000;;) { | |
667 | if (start > end) { | |
668 | end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0, | |
669 | nullptr, nullptr, &value); | |
670 | if (end < 0) { break; } | |
671 | } | |
672 | if ((start & 0x3ff) == 0) { | |
673 | // Data for a new lead surrogate. | |
674 | maxNorm16 = andedNorm16 = value; | |
675 | } else { | |
676 | if (value > maxNorm16) { | |
677 | maxNorm16 = value; | |
678 | } | |
679 | andedNorm16 &= value; | |
680 | } | |
681 | // Intersect each range with the code points for one lead surrogate. | |
682 | UChar32 leadEnd = start | 0x3ff; | |
683 | if (leadEnd <= end) { | |
684 | // End of the supplementary block for a lead surrogate. | |
685 | if (maxNorm16 >= (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]) { | |
686 | // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. | |
687 | // Otherwise it might end up at something like JAMO_VT which stays in | |
688 | // the inner decomposition quick check loop. | |
689 | maxNorm16 = (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]; | |
690 | } | |
691 | maxNorm16 = | |
692 | (maxNorm16 & ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)| | |
693 | (andedNorm16 & Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER); | |
694 | if (maxNorm16 != Normalizer2Impl::INERT) { | |
695 | umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode); | |
696 | } | |
697 | if (value == Normalizer2Impl::INERT) { | |
698 | // Potentially skip inert supplementary blocks for several lead surrogates. | |
699 | start = (end + 1) & ~0x3ff; | |
700 | } else { | |
701 | start = leadEnd + 1; | |
702 | } | |
703 | } else { | |
704 | start = end + 1; | |
729e4ab9 | 705 | } |
729e4ab9 A |
706 | } |
707 | ||
708 | // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. | |
709 | // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) | |
710 | // which is harmless. | |
711 | // As a result, the minimum code points are always BMP code points. | |
712 | int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; | |
713 | if(minCP>=0x10000) { | |
714 | indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); | |
715 | } | |
716 | minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; | |
717 | if(minCP>=0x10000) { | |
718 | indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); | |
719 | } | |
0f5d89e8 A |
720 | minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP]; |
721 | if(minCP>=0x10000) { | |
722 | indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP); | |
723 | } | |
729e4ab9 | 724 | |
3d1f044b A |
725 | LocalUCPTriePointer builtTrie( |
726 | umutablecptrie_buildImmutable(norm16Trie, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16, errorCode)); | |
727 | norm16TrieLength=ucptrie_toBinary(builtTrie.getAlias(), nullptr, 0, errorCode); | |
729e4ab9 | 728 | if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { |
3d1f044b | 729 | fprintf(stderr, "gennorm2 error: unable to build/serialize the normalization trie - %s\n", |
729e4ab9 A |
730 | errorCode.errorName()); |
731 | exit(errorCode.reset()); | |
732 | } | |
3d1f044b | 733 | umutablecptrie_close(norm16Trie); |
729e4ab9 | 734 | errorCode.reset(); |
3d1f044b A |
735 | norm16TrieBytes=new uint8_t[norm16TrieLength]; |
736 | ucptrie_toBinary(builtTrie.getAlias(), norm16TrieBytes, norm16TrieLength, errorCode); | |
737 | errorCode.assertSuccess(); | |
729e4ab9 A |
738 | |
739 | int32_t offset=(int32_t)sizeof(indexes); | |
740 | indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; | |
741 | offset+=norm16TrieLength; | |
742 | indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; | |
4388f060 A |
743 | offset+=extraData.length()*2; |
744 | indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; | |
745 | offset+=sizeof(smallFCD); | |
746 | int32_t totalSize=offset; | |
747 | for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { | |
729e4ab9 A |
748 | indexes[i]=totalSize; |
749 | } | |
750 | ||
751 | if(beVerbose) { | |
752 | printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); | |
753 | printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); | |
4388f060 | 754 | printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); |
729e4ab9 A |
755 | printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); |
756 | printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); | |
757 | printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); | |
0f5d89e8 A |
758 | printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]); |
759 | printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); | |
4388f060 | 760 | printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); |
0f5d89e8 A |
761 | printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); |
762 | printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]); | |
763 | printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]); | |
764 | printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]); | |
729e4ab9 | 765 | printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); |
0f5d89e8 | 766 | printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta); |
729e4ab9 A |
767 | printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); |
768 | } | |
769 | ||
4388f060 A |
770 | UVersionInfo nullVersion={ 0, 0, 0, 0 }; |
771 | if(0==memcmp(nullVersion, unicodeVersion, 4)) { | |
772 | u_versionFromString(unicodeVersion, U_UNICODE_VERSION); | |
773 | } | |
729e4ab9 | 774 | memcpy(dataInfo.dataVersion, unicodeVersion, 4); |
3d1f044b | 775 | return builtTrie; |
b331163b A |
776 | } |
777 | ||
778 | void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { | |
779 | processData(); | |
780 | ||
781 | IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); | |
729e4ab9 A |
782 | UNewDataMemory *pData= |
783 | udata_create(NULL, NULL, filename, &dataInfo, | |
784 | haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); | |
785 | if(errorCode.isFailure()) { | |
786 | fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", | |
787 | filename, errorCode.errorName()); | |
788 | exit(errorCode.reset()); | |
789 | } | |
790 | udata_writeBlock(pData, indexes, sizeof(indexes)); | |
3d1f044b | 791 | udata_writeBlock(pData, norm16TrieBytes, norm16TrieLength); |
f3c0d7a5 | 792 | udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length()); |
4388f060 | 793 | udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); |
729e4ab9 A |
794 | int32_t writtenSize=udata_finish(pData, errorCode); |
795 | if(errorCode.isFailure()) { | |
796 | fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); | |
797 | exit(errorCode.reset()); | |
798 | } | |
b331163b | 799 | int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; |
729e4ab9 A |
800 | if(writtenSize!=totalSize) { |
801 | fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", | |
802 | (long)writtenSize, (long)totalSize); | |
803 | exit(U_INTERNAL_PROGRAM_ERROR); | |
804 | } | |
805 | } | |
806 | ||
b331163b A |
807 | void |
808 | Normalizer2DataBuilder::writeCSourceFile(const char *filename) { | |
3d1f044b | 809 | LocalUCPTriePointer norm16Trie = processData(); |
b331163b A |
810 | |
811 | IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); | |
812 | const char *basename=findBasename(filename); | |
813 | CharString path(filename, (int32_t)(basename-filename), errorCode); | |
814 | CharString dataName(basename, errorCode); | |
815 | const char *extension=strrchr(basename, '.'); | |
816 | if(extension!=NULL) { | |
817 | dataName.truncate((int32_t)(extension-basename)); | |
818 | } | |
3d1f044b | 819 | const char *name=dataName.data(); |
b331163b A |
820 | errorCode.assertSuccess(); |
821 | ||
3d1f044b | 822 | FILE *f=usrc_create(path.data(), basename, 2016, "icu/source/tools/gennorm2/n2builder.cpp"); |
b331163b A |
823 | if(f==NULL) { |
824 | fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n", | |
825 | filename); | |
826 | exit(U_FILE_ACCESS_ERROR); | |
b331163b | 827 | } |
f3c0d7a5 | 828 | fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f); |
3d1f044b | 829 | |
b331163b | 830 | char line[100]; |
3d1f044b | 831 | sprintf(line, "static const UVersionInfo %s_formatVersion={", name); |
b331163b | 832 | usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n"); |
3d1f044b | 833 | sprintf(line, "static const UVersionInfo %s_dataVersion={", name); |
b331163b | 834 | usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n"); |
3d1f044b A |
835 | sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name); |
836 | usrc_writeArray(f, line, indexes, 32, Normalizer2Impl::IX_COUNT, "\n};\n\n"); | |
837 | ||
838 | usrc_writeUCPTrie(f, name, norm16Trie.getAlias()); | |
839 | ||
840 | sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", name); | |
841 | usrc_writeArray(f, line, extraData.getBuffer(), 16, extraData.length(), "\n};\n\n"); | |
842 | sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", name); | |
843 | usrc_writeArray(f, line, smallFCD, 8, sizeof(smallFCD), "\n};\n\n"); | |
844 | ||
845 | fputs("#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f); | |
b331163b A |
846 | fclose(f); |
847 | } | |
848 | ||
0f5d89e8 A |
849 | namespace { |
850 | ||
851 | bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) { | |
852 | if(s1 == nullptr) { | |
853 | return s2 == nullptr; | |
854 | } else if(s2 == nullptr) { | |
855 | return false; | |
856 | } else { | |
857 | return *s1 == *s2; | |
858 | } | |
859 | } | |
860 | ||
861 | const char *typeChars = "?-=>"; | |
862 | ||
863 | void writeMapping(FILE *f, const UnicodeString *m) { | |
864 | if(m != nullptr && !m->isEmpty()) { | |
865 | int32_t i = 0; | |
866 | UChar32 c = m->char32At(i); | |
867 | fprintf(f, "%04lX", (long)c); | |
868 | while((i += U16_LENGTH(c)) < m->length()) { | |
869 | c = m->char32At(i); | |
870 | fprintf(f, " %04lX", (long)c); | |
871 | } | |
872 | } | |
873 | fputs("\n", f); | |
874 | } | |
875 | ||
876 | } // namespace | |
877 | ||
878 | void | |
879 | Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const { | |
880 | // Do not processData() before writing the input-syntax data file. | |
881 | FILE *f = fopen(filename, "w"); | |
882 | if(f == nullptr) { | |
883 | fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n", | |
884 | filename); | |
885 | exit(U_FILE_ACCESS_ERROR); | |
886 | return; | |
887 | } | |
888 | ||
889 | if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 || | |
890 | unicodeVersion[2] != 0 || unicodeVersion[3] != 0) { | |
891 | char uv[U_MAX_VERSION_STRING_LENGTH]; | |
892 | u_versionToString(unicodeVersion, uv); | |
893 | fprintf(f, "* Unicode %s\n\n", uv); | |
894 | } | |
895 | ||
896 | UnicodeSetIterator ccIter(norms.ccSet); | |
897 | UChar32 start = U_SENTINEL; | |
898 | UChar32 end = U_SENTINEL; | |
899 | uint8_t prevCC = 0; | |
900 | bool done = false; | |
901 | bool didWrite = false; | |
902 | do { | |
903 | UChar32 c; | |
904 | uint8_t cc; | |
905 | if(ccIter.next() && !ccIter.isString()) { | |
906 | c = ccIter.getCodepoint(); | |
907 | cc = norms.getCC(c); | |
908 | } else { | |
909 | c = 0x110000; | |
910 | cc = 0; | |
911 | done = true; | |
912 | } | |
913 | if(cc == prevCC && c == (end + 1)) { | |
914 | end = c; | |
915 | } else { | |
916 | if(prevCC != 0) { | |
917 | if(start == end) { | |
918 | fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC); | |
919 | } else { | |
920 | fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC); | |
921 | } | |
922 | didWrite = true; | |
923 | } | |
924 | start = end = c; | |
925 | prevCC = cc; | |
926 | } | |
927 | } while(!done); | |
928 | if(didWrite) { | |
929 | fputs("\n", f); | |
930 | } | |
931 | ||
932 | UnicodeSetIterator mIter(norms.mappingSet); | |
933 | start = U_SENTINEL; | |
934 | end = U_SENTINEL; | |
935 | const UnicodeString *prevMapping = nullptr; | |
936 | Norm::MappingType prevType = Norm::NONE; | |
937 | done = false; | |
938 | do { | |
939 | UChar32 c; | |
940 | const Norm *norm; | |
941 | if(mIter.next() && !mIter.isString()) { | |
942 | c = mIter.getCodepoint(); | |
943 | norm = norms.getNorm(c); | |
944 | } else { | |
945 | c = 0x110000; | |
946 | norm = nullptr; | |
947 | done = true; | |
948 | } | |
949 | const UnicodeString *mapping; | |
950 | Norm::MappingType type; | |
951 | if(norm == nullptr) { | |
952 | mapping = nullptr; | |
953 | type = Norm::NONE; | |
954 | } else { | |
955 | type = norm->mappingType; | |
956 | if(type == Norm::NONE) { | |
957 | mapping = nullptr; | |
958 | } else { | |
959 | mapping = norm->mapping; | |
960 | } | |
961 | } | |
962 | if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) { | |
963 | end = c; | |
964 | } else { | |
965 | if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) { | |
966 | if(start == end) { | |
967 | fprintf(f, "%04lX%c", (long)start, typeChars[prevType]); | |
968 | } else { | |
969 | fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]); | |
970 | } | |
971 | writeMapping(f, prevMapping); | |
972 | } | |
973 | start = end = c; | |
974 | prevMapping = mapping; | |
975 | prevType = type; | |
976 | } | |
977 | } while(!done); | |
978 | ||
979 | fclose(f); | |
980 | } | |
981 | ||
982 | void | |
983 | Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1, | |
984 | const Normalizer2DataBuilder &b2, | |
985 | Normalizer2DataBuilder &diff) { | |
986 | // Compute diff = b1 - b2 | |
987 | // so that we should be able to get b1 = b2 + diff. | |
988 | if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) { | |
989 | memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH); | |
990 | } | |
991 | ||
992 | UnicodeSet ccSet(b1.norms.ccSet); | |
993 | ccSet.addAll(b2.norms.ccSet); | |
994 | UnicodeSetIterator ccIter(ccSet); | |
995 | while(ccIter.next() && !ccIter.isString()) { | |
996 | UChar32 c = ccIter.getCodepoint(); | |
997 | uint8_t cc1 = b1.norms.getCC(c); | |
998 | uint8_t cc2 = b2.norms.getCC(c); | |
999 | if(cc1 != cc2) { | |
1000 | diff.setCC(c, cc1); | |
1001 | } | |
1002 | } | |
1003 | ||
1004 | UnicodeSet mSet(b1.norms.mappingSet); | |
1005 | mSet.addAll(b2.norms.mappingSet); | |
1006 | UnicodeSetIterator mIter(mSet); | |
1007 | while(mIter.next() && !mIter.isString()) { | |
1008 | UChar32 c = mIter.getCodepoint(); | |
1009 | const Norm *norm1 = b1.norms.getNorm(c); | |
1010 | const Norm *norm2 = b2.norms.getNorm(c); | |
1011 | const UnicodeString *mapping1; | |
1012 | Norm::MappingType type1; | |
1013 | if(norm1 == nullptr || !norm1->hasMapping()) { | |
1014 | mapping1 = nullptr; | |
1015 | type1 = Norm::NONE; | |
1016 | } else { | |
1017 | mapping1 = norm1->mapping; | |
1018 | type1 = norm1->mappingType; | |
1019 | } | |
1020 | const UnicodeString *mapping2; | |
1021 | Norm::MappingType type2; | |
1022 | if(norm2 == nullptr || !norm2->hasMapping()) { | |
1023 | mapping2 = nullptr; | |
1024 | type2 = Norm::NONE; | |
1025 | } else { | |
1026 | mapping2 = norm2->mapping; | |
1027 | type2 = norm2->mappingType; | |
1028 | } | |
1029 | if(type1 == type2 && equalStrings(mapping1, mapping2)) { | |
1030 | // Nothing to do. | |
1031 | } else if(type1 == Norm::NONE) { | |
1032 | diff.removeMapping(c); | |
1033 | } else if(type1 == Norm::ROUND_TRIP) { | |
1034 | diff.setRoundTripMapping(c, *mapping1); | |
1035 | } else if(type1 == Norm::ONE_WAY) { | |
1036 | diff.setOneWayMapping(c, *mapping1); | |
1037 | } | |
1038 | } | |
1039 | } | |
1040 | ||
729e4ab9 A |
1041 | U_NAMESPACE_END |
1042 | ||
1043 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ | |
1044 | ||
1045 | /* | |
1046 | * Hey, Emacs, please set the following: | |
1047 | * | |
1048 | * Local Variables: | |
1049 | * indent-tabs-mode: nil | |
1050 | * End: | |
1051 | */ |