]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gennorm2/n2builder.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / tools / gennorm2 / n2builder.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9
A
3/*
4*******************************************************************************
5*
f3c0d7a5 6* Copyright (C) 2009-2016, International Business Machines
729e4ab9
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: n2builder.cpp
f3c0d7a5 11* encoding: UTF-8
729e4ab9
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov25
16* created by: Markus W. Scherer
17*
18* Builds Normalizer2 data and writes a binary .nrm file.
19* For the file format see source/common/normalizer2impl.h.
20*/
21
22#include "unicode/utypes.h"
23#include "n2builder.h"
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
729e4ab9 28#include <vector>
729e4ab9
A
29#include "unicode/errorcode.h"
30#include "unicode/localpointer.h"
31#include "unicode/putil.h"
32#include "unicode/udata.h"
33#include "unicode/uniset.h"
34#include "unicode/unistr.h"
0f5d89e8 35#include "unicode/usetiter.h"
729e4ab9 36#include "unicode/ustring.h"
b331163b 37#include "charstr.h"
0f5d89e8 38#include "extradata.h"
729e4ab9
A
39#include "hash.h"
40#include "normalizer2impl.h"
0f5d89e8 41#include "norms.h"
729e4ab9
A
42#include "toolutil.h"
43#include "unewdata.h"
44#include "utrie2.h"
45#include "uvectr32.h"
b331163b 46#include "writesrc.h"
729e4ab9
A
47
48#if !UCONFIG_NO_NORMALIZATION
49
50/* UDataInfo cf. udata.h */
51static UDataInfo dataInfo={
52 sizeof(UDataInfo),
53 0,
54
55 U_IS_BIG_ENDIAN,
56 U_CHARSET_FAMILY,
57 U_SIZEOF_UCHAR,
58 0,
59
60 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
0f5d89e8
A
61 { 3, 0, 0, 0 }, /* formatVersion */
62 { 10, 0, 0, 0 } /* dataVersion (Unicode version) */
729e4ab9
A
63};
64
65U_NAMESPACE_BEGIN
66
67class HangulIterator {
68public:
69 struct Range {
0f5d89e8 70 UChar32 start, end;
729e4ab9
A
71 };
72
73 HangulIterator() : rangeIndex(0) {}
74 const Range *nextRange() {
b331163b 75 if(rangeIndex<UPRV_LENGTHOF(ranges)) {
729e4ab9
A
76 return ranges+rangeIndex++;
77 } else {
78 return NULL;
79 }
80 }
729e4ab9
A
81private:
82 static const Range ranges[4];
83 int32_t rangeIndex;
84};
85
86const HangulIterator::Range HangulIterator::ranges[4]={
0f5d89e8
A
87 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_END },
88 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_END },
729e4ab9 89 // JAMO_T_BASE+1: not U+11A7
0f5d89e8
A
90 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END },
91 { Hangul::HANGUL_BASE, Hangul::HANGUL_END },
729e4ab9
A
92};
93
729e4ab9 94Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
0f5d89e8 95 norms(errorCode),
b331163b 96 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
0f5d89e8 97 norm16Trie(nullptr), norm16TrieLength(0) {
729e4ab9 98 memset(unicodeVersion, 0, sizeof(unicodeVersion));
729e4ab9 99 memset(indexes, 0, sizeof(indexes));
4388f060 100 memset(smallFCD, 0, sizeof(smallFCD));
729e4ab9
A
101}
102
103Normalizer2DataBuilder::~Normalizer2DataBuilder() {
729e4ab9
A
104 utrie2_close(norm16Trie);
105}
106
107void
108Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
4388f060
A
109 UVersionInfo nullVersion={ 0, 0, 0, 0 };
110 UVersionInfo version;
111 u_versionFromString(version, v);
112 if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
113 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
114 ) {
115 char buffer[U_MAX_VERSION_STRING_LENGTH];
116 u_versionToString(unicodeVersion, buffer);
117 fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
118 buffer, v);
119 exit(U_ILLEGAL_ARGUMENT_ERROR);
120 }
121 memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
729e4ab9
A
122}
123
729e4ab9
A
124Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
125 if(p!=NULL) {
126 if(p->mappingType!=Norm::NONE) {
127 if( overrideHandling==OVERRIDE_NONE ||
128 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
129 ) {
130 fprintf(stderr,
131 "error in gennorm2 phase %d: "
132 "not permitted to override mapping for U+%04lX from phase %d\n",
133 (int)phase, (long)c, (int)p->mappingPhase);
134 exit(U_INVALID_FORMAT_ERROR);
135 }
136 delete p->mapping;
137 p->mapping=NULL;
138 }
139 p->mappingPhase=phase;
140 }
141 return p;
142}
143
144void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
145 overrideHandling=oh;
146 ++phase;
147}
148
149void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
0f5d89e8
A
150 norms.createNorm(c)->cc=cc;
151 norms.ccSet.add(c);
729e4ab9
A
152}
153
154static UBool isWellFormed(const UnicodeString &s) {
155 UErrorCode errorCode=U_ZERO_ERROR;
f3c0d7a5 156 u_strToUTF8(NULL, 0, NULL, toUCharPtr(s.getBuffer()), s.length(), &errorCode);
729e4ab9
A
157 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
158}
159
160void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
161 if(!isWellFormed(m)) {
162 fprintf(stderr,
163 "error in gennorm2 phase %d: "
164 "illegal one-way mapping from U+%04lX to malformed string\n",
165 (int)phase, (long)c);
166 exit(U_INVALID_FORMAT_ERROR);
167 }
0f5d89e8 168 Norm *p=checkNormForMapping(norms.createNorm(c), c);
729e4ab9
A
169 p->mapping=new UnicodeString(m);
170 p->mappingType=Norm::ONE_WAY;
171 p->setMappingCP();
0f5d89e8 172 norms.mappingSet.add(c);
729e4ab9
A
173}
174
175void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
176 if(U_IS_SURROGATE(c)) {
177 fprintf(stderr,
178 "error in gennorm2 phase %d: "
179 "illegal round-trip mapping from surrogate code point U+%04lX\n",
180 (int)phase, (long)c);
181 exit(U_INVALID_FORMAT_ERROR);
182 }
183 if(!isWellFormed(m)) {
184 fprintf(stderr,
185 "error in gennorm2 phase %d: "
186 "illegal round-trip mapping from U+%04lX to malformed string\n",
187 (int)phase, (long)c);
188 exit(U_INVALID_FORMAT_ERROR);
189 }
f3c0d7a5 190 int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length());
729e4ab9
A
191 if(numCP!=2) {
192 fprintf(stderr,
193 "error in gennorm2 phase %d: "
194 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
195 (int)phase, (long)c, (int)numCP);
196 exit(U_INVALID_FORMAT_ERROR);
197 }
0f5d89e8 198 Norm *p=checkNormForMapping(norms.createNorm(c), c);
729e4ab9
A
199 p->mapping=new UnicodeString(m);
200 p->mappingType=Norm::ROUND_TRIP;
201 p->mappingCP=U_SENTINEL;
0f5d89e8 202 norms.mappingSet.add(c);
729e4ab9
A
203}
204
205void Normalizer2DataBuilder::removeMapping(UChar32 c) {
0f5d89e8
A
206 // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
207 Norm *p=checkNormForMapping(norms.createNorm(c), c);
208 p->mappingType=Norm::REMOVED;
209 norms.mappingSet.add(c);
729e4ab9
A
210}
211
0f5d89e8
A
212UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
213 Norm::MappingType mappingType) const {
729e4ab9 214 if(buffer.isEmpty()) {
0f5d89e8 215 return FALSE; // Maps-to-empty-string is no boundary of any kind.
729e4ab9
A
216 }
217 int32_t lastStarterIndex=buffer.lastStarterIndex();
218 if(lastStarterIndex<0) {
0f5d89e8
A
219 return FALSE; // no starter
220 }
221 const int32_t lastIndex=buffer.length()-1;
222 if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) {
223 // One-way mapping where after the last starter is at least one combining mark
224 // with a combining class greater than 1,
225 // which means that another combining mark can reorder before it.
226 // By contrast, in a round-trip mapping this does not prevent a boundary as long as
227 // the starter or composite does not combine-forward with a following combining mark.
228 return FALSE;
729e4ab9
A
229 }
230 UChar32 starter=buffer.charAt(lastStarterIndex);
0f5d89e8
A
231 if(lastStarterIndex==0 && norms.combinesBack(starter)) {
232 // The last starter is at the beginning of the mapping and combines backward.
233 return FALSE;
234 }
235 if(Hangul::isJamoL(starter) ||
236 (Hangul::isJamoV(starter) &&
237 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
729e4ab9
A
238 // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
239 // otherwise it is blocked.
0f5d89e8 240 return lastStarterIndex!=lastIndex;
729e4ab9 241 }
4388f060 242 // Note: There can be no Hangul syllable in the fully decomposed mapping.
0f5d89e8
A
243
244 // Multiple starters can combine into one.
245 // Look for the first of the last sequence of starters, excluding Jamos.
246 int32_t i=lastStarterIndex;
247 UChar32 c;
248 while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) {
249 starter=c;
250 --i;
251 }
252 // Compose as far as possible, and see if further compositions with
253 // characters following this mapping are possible.
254 const Norm *starterNorm=norms.getNorm(starter);
255 if(i==lastStarterIndex &&
256 (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
257 return TRUE; // The last starter does not combine forward.
729e4ab9 258 }
729e4ab9 259 uint8_t prevCC=0;
0f5d89e8
A
260 while(++i<buffer.length()) {
261 uint8_t cc=buffer.ccAt(i); // !=0 if after last starter
262 if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
263 // The starter combines with a mark that reorders before the current one.
264 return FALSE;
729e4ab9 265 }
0f5d89e8
A
266 UChar32 c=buffer.charAt(i);
267 if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
268 norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) {
269 // The starter combines with c into a composite replacement starter.
270 starterNorm=norms.getNorm(starter);
271 if(i>=lastStarterIndex &&
272 (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
273 return TRUE; // The composite does not combine further.
729e4ab9 274 }
0f5d89e8
A
275 // Keep prevCC because we "removed" the combining mark.
276 } else if(cc==0) {
277 starterNorm=norms.getNorm(c);
278 if(i==lastStarterIndex &&
279 (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
280 return TRUE; // The new starter does not combine forward.
281 }
282 prevCC=0;
729e4ab9
A
283 } else {
284 prevCC=cc;
4388f060 285 }
729e4ab9 286 }
0f5d89e8
A
287 if(prevCC==0) {
288 return FALSE; // forward-combining starter at the very end
729e4ab9 289 }
0f5d89e8
A
290 if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
291 // The starter combines with another mark.
292 return FALSE;
729e4ab9 293 }
0f5d89e8 294 return TRUE;
729e4ab9
A
295}
296
0f5d89e8
A
297UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const {
298 if(buffer.lastStarterIndex()<0) {
299 return FALSE; // no starter
729e4ab9 300 }
0f5d89e8
A
301 const Norm *starterNorm=nullptr;
302 uint8_t prevCC=0;
303 for(int32_t i=0; i<buffer.length(); ++i) {
304 UChar32 c=buffer.charAt(i);
305 uint8_t cc=buffer.ccAt(i);
306 if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
307 norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) {
308 return TRUE; // normal composite
309 } else if(cc==0) {
310 if(Hangul::isJamoL(c)) {
311 if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) {
312 return TRUE; // Hangul syllable
313 }
314 starterNorm=nullptr;
729e4ab9 315 } else {
0f5d89e8 316 starterNorm=norms.getNorm(c);
729e4ab9 317 }
729e4ab9 318 }
0f5d89e8 319 prevCC=cc;
729e4ab9 320 }
0f5d89e8 321 return FALSE;
729e4ab9
A
322}
323
0f5d89e8
A
324void Normalizer2DataBuilder::postProcess(Norm &norm) {
325 // Prerequisites: Compositions are built, mappings are recursively decomposed.
326 // Mappings are not yet in canonical order.
327 //
328 // This function works on a Norm struct. We do not know which code point(s) map(s) to it.
329 // Therefore, we cannot compute algorithmic mapping deltas here.
330 // Error conditions are checked, but printed later when we do know the offending code point.
331 if(norm.hasMapping()) {
332 if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) {
333 norm.error="mapping longer than maximum of 31";
334 return;
729e4ab9 335 }
0f5d89e8
A
336 // Ensure canonical order.
337 BuilderReorderingBuffer buffer;
338 if(norm.rawMapping!=nullptr) {
339 norms.reorder(*norm.rawMapping, buffer);
340 buffer.reset();
729e4ab9 341 }
0f5d89e8
A
342 norms.reorder(*norm.mapping, buffer);
343 if(buffer.isEmpty()) {
344 // A character that is deleted (maps to an empty string) must
345 // get the worst-case lccc and tccc values because arbitrary
346 // characters on both sides will become adjacent.
347 norm.leadCC=1;
348 norm.trailCC=0xff;
4388f060 349 } else {
0f5d89e8
A
350 norm.leadCC=buffer.ccAt(0);
351 norm.trailCC=buffer.ccAt(buffer.length()-1);
729e4ab9 352 }
0f5d89e8
A
353
354 norm.hasCompBoundaryBefore=
355 !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
356 norm.hasCompBoundaryAfter=
357 norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType);
358
359 if(norm.combinesBack) {
360 norm.error="combines-back and decomposes, not possible in Unicode normalization";
361 } else if(norm.mappingType==Norm::ROUND_TRIP) {
362 if(norm.compositions!=NULL) {
363 norm.type=Norm::YES_NO_COMBINES_FWD;
364 } else {
365 norm.type=Norm::YES_NO_MAPPING_ONLY;
366 }
367 } else { // one-way mapping
368 if(norm.compositions!=NULL) {
369 norm.error="combines-forward and has a one-way mapping, "
370 "not possible in Unicode normalization";
371 } else if(buffer.isEmpty()) {
372 norm.type=Norm::NO_NO_EMPTY;
373 } else if(!norm.hasCompBoundaryBefore) {
374 norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC;
375 } else if(mappingRecomposes(buffer)) {
376 norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE;
377 } else {
378 // The mapping is comp-normalized.
379 norm.type=Norm::NO_NO_COMP_YES;
729e4ab9
A
380 }
381 }
0f5d89e8
A
382 } else { // no mapping
383 norm.leadCC=norm.trailCC=norm.cc;
384
385 norm.hasCompBoundaryBefore=
386 norm.cc==0 && !norm.combinesBack;
387 norm.hasCompBoundaryAfter=
388 norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr;
389
390 if(norm.combinesBack) {
391 if(norm.compositions!=nullptr) {
392 // Earlier code checked ccc=0.
393 norm.type=Norm::MAYBE_YES_COMBINES_FWD;
729e4ab9 394 } else {
0f5d89e8 395 norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc
729e4ab9 396 }
0f5d89e8
A
397 } else if(norm.compositions!=nullptr) {
398 // Earlier code checked ccc=0.
399 norm.type=Norm::YES_YES_COMBINES_FWD;
400 } else if(norm.cc!=0) {
401 norm.type=Norm::YES_YES_WITH_CC;
402 } else {
403 norm.type=Norm::INERT;
729e4ab9
A
404 }
405 }
406}
407
0f5d89e8 408class Norm16Writer : public Norms::Enumerator {
729e4ab9 409public:
0f5d89e8
A
410 Norm16Writer(Norms &n, Normalizer2DataBuilder &b) : Norms::Enumerator(n), builder(b) {}
411 void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE {
412 builder.writeNorm16(start, end, norm);
729e4ab9 413 }
0f5d89e8 414 Normalizer2DataBuilder &builder;
729e4ab9
A
415};
416
0f5d89e8
A
417void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
418 UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
419 smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
420}
421
422void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) {
423 if((norm.leadCC|norm.trailCC)!=0) {
424 for(UChar32 c=start; c<=end; ++c) {
425 setSmallFCD(c);
426 }
427 }
428
429 int32_t norm16;
430 switch(norm.type) {
431 case Norm::INERT:
432 norm16=Normalizer2Impl::INERT;
433 break;
434 case Norm::YES_YES_COMBINES_FWD:
435 norm16=norm.offset*2;
436 break;
437 case Norm::YES_NO_COMBINES_FWD:
438 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2;
439 break;
440 case Norm::YES_NO_MAPPING_ONLY:
441 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2;
442 break;
443 case Norm::NO_NO_COMP_YES:
444 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2;
445 break;
446 case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
447 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2;
448 break;
449 case Norm::NO_NO_COMP_NO_MAYBE_CC:
450 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2;
451 break;
452 case Norm::NO_NO_EMPTY:
453 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2;
454 break;
455 case Norm::NO_NO_DELTA:
456 {
457 // Positive offset from minNoNoDelta, shifted left for additional bits.
458 int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT;
459 if(norm.trailCC==0) {
460 // DELTA_TCCC_0==0
461 } else if(norm.trailCC==1) {
462 offset|=Normalizer2Impl::DELTA_TCCC_1;
463 } else {
464 offset|=Normalizer2Impl::DELTA_TCCC_GT_1;
729e4ab9 465 }
0f5d89e8 466 norm16=getMinNoNoDelta()+offset;
729e4ab9 467 break;
729e4ab9 468 }
0f5d89e8
A
469 case Norm::MAYBE_YES_COMBINES_FWD:
470 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2;
471 break;
472 case Norm::MAYBE_YES_SIMPLE:
473 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255
474 break;
475 case Norm::YES_YES_WITH_CC:
476 U_ASSERT(norm.cc!=0);
477 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255
478 break;
479 default: // Should not occur.
480 exit(U_INTERNAL_PROGRAM_ERROR);
481 }
482 U_ASSERT((norm16&1)==0);
483 if(norm.hasCompBoundaryAfter) {
484 norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
485 }
486 IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
487 utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
488
489 // Set the minimum code points for real data lookups in the quick check loops.
490 UBool isDecompNo=
491 (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) ||
492 norm.cc!=0;
493 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
494 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
495 }
496 UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES;
497 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
498 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
499 }
500 if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) {
501 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start;
729e4ab9
A
502 }
503}
504
505void Normalizer2DataBuilder::setHangulData() {
506 HangulIterator hi;
507 const HangulIterator::Range *range;
508 // Check that none of the Hangul/Jamo code points have data.
509 while((range=hi.nextRange())!=NULL) {
0f5d89e8
A
510 for(UChar32 c=range->start; c<=range->end; ++c) {
511 if(utrie2_get32(norm16Trie, c)>Normalizer2Impl::INERT) {
729e4ab9
A
512 fprintf(stderr,
513 "gennorm2 error: "
514 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
515 (long)c);
516 exit(U_INVALID_FORMAT_ERROR);
517 }
518 }
519 }
520 // Set data for algorithmic runtime handling.
521 IcuToolErrorCode errorCode("gennorm2/setHangulData()");
0f5d89e8
A
522
523 // Jamo V/T are maybeYes
524 if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
525 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE;
729e4ab9 526 }
0f5d89e8
A
527 utrie2_setRange32(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END,
528 Normalizer2Impl::JAMO_L, TRUE, errorCode);
529 utrie2_setRange32(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END,
530 Normalizer2Impl::JAMO_VT, TRUE, errorCode);
531 // JAMO_T_BASE+1: not U+11A7
532 utrie2_setRange32(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END,
533 Normalizer2Impl::JAMO_VT, TRUE, errorCode);
534
535 // Hangul LV encoded as minYesNo
536 uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO];
537 // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
538 uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]|
539 Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
540 if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
541 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE;
542 }
543 // Set the first LV, then write all other Hangul syllables as LVT,
544 // then overwrite the remaining LV.
545 // The UTrie2 should be able to compact this into 7 32-item blocks
546 // because JAMO_T_COUNT is 28 and the UTrie2 granularity is 4.
547 // (7*32=8*28 smallest common multiple)
548 utrie2_set32(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode);
549 utrie2_setRange32(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END,
550 lvt, TRUE, errorCode);
551 UChar32 c=Hangul::HANGUL_BASE;
552 while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) {
553 utrie2_set32(norm16Trie, c, lv, errorCode);
554 }
555 errorCode.assertSuccess();
729e4ab9
A
556}
557
0f5d89e8
A
namespace {

/** Accumulator for enumRangeMaxValue() over a set of norm16 values. */
struct Norm16Summary {
    /** Largest norm16 value seen. */
    uint32_t maxNorm16;
    /**
     * Bitwise AND of all values seen: a bit is 0 if any value has a 0 there.
     * Used for worst-case HAS_COMP_BOUNDARY_AFTER.
     */
    uint32_t andedNorm16;
};

}  // namespace
568
729e4ab9
A
569U_CDECL_BEGIN
570
571static UBool U_CALLCONV
572enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
0f5d89e8
A
573 Norm16Summary *p=(Norm16Summary *)context;
574 if(value>p->maxNorm16) {
575 p->maxNorm16=value;
729e4ab9 576 }
0f5d89e8 577 p->andedNorm16&=value;
729e4ab9
A
578 return TRUE;
579}
580
581U_CDECL_END
582
583void Normalizer2DataBuilder::processData() {
584 IcuToolErrorCode errorCode("gennorm2/processData()");
0f5d89e8 585 norm16Trie=utrie2_open(Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode);
729e4ab9
A
586 errorCode.assertSuccess();
587
0f5d89e8
A
588 // Build composition lists before recursive decomposition,
589 // so that we still have the raw, pair-wise mappings.
590 CompositionBuilder compBuilder(norms);
591 norms.enumRanges(compBuilder);
729e4ab9 592
0f5d89e8
A
593 // Recursively decompose all mappings.
594 Decomposer decomposer(norms);
729e4ab9
A
595 do {
596 decomposer.didDecompose=FALSE;
0f5d89e8 597 norms.enumRanges(decomposer);
729e4ab9
A
598 } while(decomposer.didDecompose);
599
0f5d89e8
A
600 // Set the Norm::Type and other properties.
601 int32_t normsLength=norms.length();
729e4ab9 602 for(int32_t i=1; i<normsLength; ++i) {
0f5d89e8 603 postProcess(norms.getNormRefByIndex(i));
729e4ab9
A
604 }
605
0f5d89e8
A
606 // Write the properties, mappings and composition lists to
607 // appropriate parts of the "extra data" array.
608 ExtraData extra(norms, optimization==OPTIMIZE_FAST);
609 norms.enumRanges(extra);
610
611 extraData=extra.yesYesCompositions;
612 indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2;
613 extraData.append(extra.yesNoMappingsAndCompositions);
614 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2;
615 extraData.append(extra.yesNoMappingsOnly);
616 indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2;
617 extraData.append(extra.noNoMappingsCompYes);
618 indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2;
619 extraData.append(extra.noNoMappingsCompBoundaryBefore);
620 indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2;
621 extraData.append(extra.noNoMappingsCompNoMaybeCC);
622 indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2;
623 extraData.append(extra.noNoMappingsEmpty);
624 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2;
625
626 // Pad the maybeYesCompositions length to a multiple of 4,
627 // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
628 while(extra.maybeYesCompositions.length()&3) {
629 extra.maybeYesCompositions.append((UChar)0);
630 }
631 extraData.insert(0, extra.maybeYesCompositions);
632 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
633 Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
634 extra.maybeYesCompositions.length()*2;
729e4ab9 635
729e4ab9
A
636 // Pad to even length for 4-byte alignment of following data.
637 if(extraData.length()&1) {
638 extraData.append((UChar)0);
639 }
640
0f5d89e8
A
641 int32_t minNoNoDelta=getMinNoNoDelta();
642 U_ASSERT((minNoNoDelta&7)==0);
729e4ab9
A
643 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
644 fprintf(stderr,
645 "gennorm2 error: "
646 "data structure overflow, too much mapping composition data\n");
647 exit(U_BUFFER_OVERFLOW_ERROR);
648 }
649
0f5d89e8
A
650 // writeNorm16() and setHangulData() reduce these as needed.
651 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
652 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
653 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000;
654
655 // Map each code point to its norm16 value,
656 // including the properties that fit directly,
657 // and the offset to the "extra data" if necessary.
658 Norm16Writer norm16Writer(norms, *this);
659 norms.enumRanges(norm16Writer);
729e4ab9
A
660
661 setHangulData();
662
663 // Look for the "worst" norm16 value of any supplementary code point
664 // corresponding to a lead surrogate, and set it as that surrogate's value.
0f5d89e8 665 // Enables UTF-16 quick check inner loops to look at only code units.
729e4ab9
A
666 //
667 // We could be more sophisticated:
668 // We could collect a bit set for whether there are values in the different
669 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
670 // and select the best value that only breaks the composition and/or decomposition
671 // inner loops if necessary.
672 // However, that seems like overkill for an optimization for supplementary characters.
673 for(UChar lead=0xd800; lead<0xdc00; ++lead) {
0f5d89e8
A
674 uint32_t surrogateCPNorm16=utrie2_get32(norm16Trie, lead);
675 Norm16Summary summary={ surrogateCPNorm16, surrogateCPNorm16 };
676 utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &summary);
677 uint32_t norm16=summary.maxNorm16;
678 if(norm16>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
679 norm16>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]) {
729e4ab9
A
680 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
681 // Otherwise it might end up at something like JAMO_VT which stays in
682 // the inner decomposition quick check loop.
0f5d89e8 683 norm16=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
729e4ab9 684 }
0f5d89e8
A
685 norm16=
686 (norm16&~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)|
687 (summary.andedNorm16&Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER);
688 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, norm16, errorCode);
729e4ab9
A
689 }
690
691 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
692 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
693 // which is harmless.
694 // As a result, the minimum code points are always BMP code points.
695 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
696 if(minCP>=0x10000) {
697 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
698 }
699 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
700 if(minCP>=0x10000) {
701 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
702 }
0f5d89e8
A
703 minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP];
704 if(minCP>=0x10000) {
705 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP);
706 }
729e4ab9 707
729e4ab9 708 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
b331163b 709 norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
729e4ab9
A
710 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
711 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
712 errorCode.errorName());
713 exit(errorCode.reset());
714 }
715 errorCode.reset();
729e4ab9
A
716
717 int32_t offset=(int32_t)sizeof(indexes);
718 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
719 offset+=norm16TrieLength;
720 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
4388f060
A
721 offset+=extraData.length()*2;
722 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
723 offset+=sizeof(smallFCD);
724 int32_t totalSize=offset;
725 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
729e4ab9
A
726 indexes[i]=totalSize;
727 }
728
729 if(beVerbose) {
730 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength);
731 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length());
4388f060 732 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD));
729e4ab9
A
733 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize);
734 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
735 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
0f5d89e8
A
736 printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]);
737 printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
4388f060 738 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
0f5d89e8
A
739 printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
740 printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
741 printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
742 printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]);
729e4ab9 743 printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
0f5d89e8 744 printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta);
729e4ab9
A
745 printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
746 }
747
4388f060
A
748 UVersionInfo nullVersion={ 0, 0, 0, 0 };
749 if(0==memcmp(nullVersion, unicodeVersion, 4)) {
750 u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
751 }
729e4ab9 752 memcpy(dataInfo.dataVersion, unicodeVersion, 4);
b331163b
A
753}
754
755void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
756 processData();
757
758 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
759 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
760 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
761 errorCode.assertSuccess();
762
729e4ab9
A
763 UNewDataMemory *pData=
764 udata_create(NULL, NULL, filename, &dataInfo,
765 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
766 if(errorCode.isFailure()) {
767 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
768 filename, errorCode.errorName());
769 exit(errorCode.reset());
770 }
771 udata_writeBlock(pData, indexes, sizeof(indexes));
772 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
f3c0d7a5 773 udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length());
4388f060 774 udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
729e4ab9
A
775 int32_t writtenSize=udata_finish(pData, errorCode);
776 if(errorCode.isFailure()) {
777 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
778 exit(errorCode.reset());
779 }
b331163b 780 int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
729e4ab9
A
781 if(writtenSize!=totalSize) {
782 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
783 (long)writtenSize, (long)totalSize);
784 exit(U_INTERNAL_PROGRAM_ERROR);
785 }
786}
787
b331163b
A
788void
789Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
790 processData();
791
792 IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
793 const char *basename=findBasename(filename);
794 CharString path(filename, (int32_t)(basename-filename), errorCode);
795 CharString dataName(basename, errorCode);
796 const char *extension=strrchr(basename, '.');
797 if(extension!=NULL) {
798 dataName.truncate((int32_t)(extension-basename));
799 }
800 errorCode.assertSuccess();
801
802 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
803 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
804 errorCode.assertSuccess();
805
806 FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp");
807 if(f==NULL) {
808 fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
809 filename);
810 exit(U_FILE_ACCESS_ERROR);
811 return;
812 }
f3c0d7a5 813 fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f);
b331163b
A
814 char line[100];
815 sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data());
816 usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n");
817 sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data());
818 usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n");
819 sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n",
820 dataName.data());
821 usrc_writeArray(f,
822 line,
823 indexes, 32, Normalizer2Impl::IX_COUNT,
824 "\n};\n\n");
825 sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data());
826 usrc_writeUTrie2Arrays(f,
827 line, NULL,
828 norm16Trie,
829 "\n};\n\n");
830 sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data());
831 usrc_writeArray(f,
832 line,
833 extraData.getBuffer(), 16, extraData.length(),
834 "\n};\n\n");
835 sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data());
836 usrc_writeArray(f,
837 line,
838 smallFCD, 8, sizeof(smallFCD),
839 "\n};\n\n");
b331163b
A
840 sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data());
841 char line2[100];
842 sprintf(line2, "%s_trieIndex", dataName.data());
843 usrc_writeUTrie2Struct(f,
844 line,
845 norm16Trie, line2, NULL,
846 "};\n");
f3c0d7a5 847 fputs("\n#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f);
b331163b
A
848 fclose(f);
849}
850
0f5d89e8
A
851namespace {
852
853bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
854 if(s1 == nullptr) {
855 return s2 == nullptr;
856 } else if(s2 == nullptr) {
857 return false;
858 } else {
859 return *s1 == *s2;
860 }
861}
862
863const char *typeChars = "?-=>";
864
865void writeMapping(FILE *f, const UnicodeString *m) {
866 if(m != nullptr && !m->isEmpty()) {
867 int32_t i = 0;
868 UChar32 c = m->char32At(i);
869 fprintf(f, "%04lX", (long)c);
870 while((i += U16_LENGTH(c)) < m->length()) {
871 c = m->char32At(i);
872 fprintf(f, " %04lX", (long)c);
873 }
874 }
875 fputs("\n", f);
876}
877
878} // namespace
879
880void
881Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
882 // Do not processData() before writing the input-syntax data file.
883 FILE *f = fopen(filename, "w");
884 if(f == nullptr) {
885 fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
886 filename);
887 exit(U_FILE_ACCESS_ERROR);
888 return;
889 }
890
891 if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
892 unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
893 char uv[U_MAX_VERSION_STRING_LENGTH];
894 u_versionToString(unicodeVersion, uv);
895 fprintf(f, "* Unicode %s\n\n", uv);
896 }
897
898 UnicodeSetIterator ccIter(norms.ccSet);
899 UChar32 start = U_SENTINEL;
900 UChar32 end = U_SENTINEL;
901 uint8_t prevCC = 0;
902 bool done = false;
903 bool didWrite = false;
904 do {
905 UChar32 c;
906 uint8_t cc;
907 if(ccIter.next() && !ccIter.isString()) {
908 c = ccIter.getCodepoint();
909 cc = norms.getCC(c);
910 } else {
911 c = 0x110000;
912 cc = 0;
913 done = true;
914 }
915 if(cc == prevCC && c == (end + 1)) {
916 end = c;
917 } else {
918 if(prevCC != 0) {
919 if(start == end) {
920 fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
921 } else {
922 fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
923 }
924 didWrite = true;
925 }
926 start = end = c;
927 prevCC = cc;
928 }
929 } while(!done);
930 if(didWrite) {
931 fputs("\n", f);
932 }
933
934 UnicodeSetIterator mIter(norms.mappingSet);
935 start = U_SENTINEL;
936 end = U_SENTINEL;
937 const UnicodeString *prevMapping = nullptr;
938 Norm::MappingType prevType = Norm::NONE;
939 done = false;
940 do {
941 UChar32 c;
942 const Norm *norm;
943 if(mIter.next() && !mIter.isString()) {
944 c = mIter.getCodepoint();
945 norm = norms.getNorm(c);
946 } else {
947 c = 0x110000;
948 norm = nullptr;
949 done = true;
950 }
951 const UnicodeString *mapping;
952 Norm::MappingType type;
953 if(norm == nullptr) {
954 mapping = nullptr;
955 type = Norm::NONE;
956 } else {
957 type = norm->mappingType;
958 if(type == Norm::NONE) {
959 mapping = nullptr;
960 } else {
961 mapping = norm->mapping;
962 }
963 }
964 if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
965 end = c;
966 } else {
967 if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
968 if(start == end) {
969 fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
970 } else {
971 fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
972 }
973 writeMapping(f, prevMapping);
974 }
975 start = end = c;
976 prevMapping = mapping;
977 prevType = type;
978 }
979 } while(!done);
980
981 fclose(f);
982}
983
984void
985Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
986 const Normalizer2DataBuilder &b2,
987 Normalizer2DataBuilder &diff) {
988 // Compute diff = b1 - b2
989 // so that we should be able to get b1 = b2 + diff.
990 if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
991 memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
992 }
993
994 UnicodeSet ccSet(b1.norms.ccSet);
995 ccSet.addAll(b2.norms.ccSet);
996 UnicodeSetIterator ccIter(ccSet);
997 while(ccIter.next() && !ccIter.isString()) {
998 UChar32 c = ccIter.getCodepoint();
999 uint8_t cc1 = b1.norms.getCC(c);
1000 uint8_t cc2 = b2.norms.getCC(c);
1001 if(cc1 != cc2) {
1002 diff.setCC(c, cc1);
1003 }
1004 }
1005
1006 UnicodeSet mSet(b1.norms.mappingSet);
1007 mSet.addAll(b2.norms.mappingSet);
1008 UnicodeSetIterator mIter(mSet);
1009 while(mIter.next() && !mIter.isString()) {
1010 UChar32 c = mIter.getCodepoint();
1011 const Norm *norm1 = b1.norms.getNorm(c);
1012 const Norm *norm2 = b2.norms.getNorm(c);
1013 const UnicodeString *mapping1;
1014 Norm::MappingType type1;
1015 if(norm1 == nullptr || !norm1->hasMapping()) {
1016 mapping1 = nullptr;
1017 type1 = Norm::NONE;
1018 } else {
1019 mapping1 = norm1->mapping;
1020 type1 = norm1->mappingType;
1021 }
1022 const UnicodeString *mapping2;
1023 Norm::MappingType type2;
1024 if(norm2 == nullptr || !norm2->hasMapping()) {
1025 mapping2 = nullptr;
1026 type2 = Norm::NONE;
1027 } else {
1028 mapping2 = norm2->mapping;
1029 type2 = norm2->mappingType;
1030 }
1031 if(type1 == type2 && equalStrings(mapping1, mapping2)) {
1032 // Nothing to do.
1033 } else if(type1 == Norm::NONE) {
1034 diff.removeMapping(c);
1035 } else if(type1 == Norm::ROUND_TRIP) {
1036 diff.setRoundTripMapping(c, *mapping1);
1037 } else if(type1 == Norm::ONE_WAY) {
1038 diff.setOneWayMapping(c, *mapping1);
1039 }
1040 }
1041}
1042
729e4ab9
A
1043U_NAMESPACE_END
1044
1045#endif /* #if !UCONFIG_NO_NORMALIZATION */
1046
1047/*
1048 * Hey, Emacs, please set the following:
1049 *
1050 * Local Variables:
1051 * indent-tabs-mode: nil
1052 * End:
1053 */