]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gennorm2/n2builder.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / tools / gennorm2 / n2builder.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9
A
3/*
4*******************************************************************************
5*
f3c0d7a5 6* Copyright (C) 2009-2016, International Business Machines
729e4ab9
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: n2builder.cpp
f3c0d7a5 11* encoding: UTF-8
729e4ab9
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov25
16* created by: Markus W. Scherer
17*
18* Builds Normalizer2 data and writes a binary .nrm file.
19* For the file format see source/common/normalizer2impl.h.
20*/
21
22#include "unicode/utypes.h"
23#include "n2builder.h"
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
729e4ab9 28#include <vector>
729e4ab9
A
29#include "unicode/errorcode.h"
30#include "unicode/localpointer.h"
31#include "unicode/putil.h"
3d1f044b 32#include "unicode/ucptrie.h"
729e4ab9 33#include "unicode/udata.h"
3d1f044b 34#include "unicode/umutablecptrie.h"
729e4ab9
A
35#include "unicode/uniset.h"
36#include "unicode/unistr.h"
0f5d89e8 37#include "unicode/usetiter.h"
729e4ab9 38#include "unicode/ustring.h"
b331163b 39#include "charstr.h"
0f5d89e8 40#include "extradata.h"
729e4ab9
A
41#include "hash.h"
42#include "normalizer2impl.h"
0f5d89e8 43#include "norms.h"
729e4ab9
A
44#include "toolutil.h"
45#include "unewdata.h"
729e4ab9 46#include "uvectr32.h"
b331163b 47#include "writesrc.h"
729e4ab9
A
48
49#if !UCONFIG_NO_NORMALIZATION
50
51/* UDataInfo cf. udata.h */
52static UDataInfo dataInfo={
53 sizeof(UDataInfo),
54 0,
55
56 U_IS_BIG_ENDIAN,
57 U_CHARSET_FAMILY,
58 U_SIZEOF_UCHAR,
59 0,
60
61 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
3d1f044b
A
62 { 4, 0, 0, 0 }, /* formatVersion */
63 { 11, 0, 0, 0 } /* dataVersion (Unicode version) */
729e4ab9
A
64};
65
66U_NAMESPACE_BEGIN
67
68class HangulIterator {
69public:
70 struct Range {
0f5d89e8 71 UChar32 start, end;
729e4ab9
A
72 };
73
74 HangulIterator() : rangeIndex(0) {}
75 const Range *nextRange() {
b331163b 76 if(rangeIndex<UPRV_LENGTHOF(ranges)) {
729e4ab9
A
77 return ranges+rangeIndex++;
78 } else {
79 return NULL;
80 }
81 }
729e4ab9
A
82private:
83 static const Range ranges[4];
84 int32_t rangeIndex;
85};
86
87const HangulIterator::Range HangulIterator::ranges[4]={
0f5d89e8
A
88 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_END },
89 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_END },
729e4ab9 90 // JAMO_T_BASE+1: not U+11A7
0f5d89e8
A
91 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END },
92 { Hangul::HANGUL_BASE, Hangul::HANGUL_END },
729e4ab9
A
93};
94
729e4ab9 95Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
0f5d89e8 96 norms(errorCode),
b331163b 97 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
3d1f044b 98 norm16TrieBytes(nullptr), norm16TrieLength(0) {
729e4ab9 99 memset(unicodeVersion, 0, sizeof(unicodeVersion));
729e4ab9 100 memset(indexes, 0, sizeof(indexes));
4388f060 101 memset(smallFCD, 0, sizeof(smallFCD));
729e4ab9
A
102}
103
104Normalizer2DataBuilder::~Normalizer2DataBuilder() {
3d1f044b 105 delete[] norm16TrieBytes;
729e4ab9
A
106}
107
108void
109Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
4388f060
A
110 UVersionInfo nullVersion={ 0, 0, 0, 0 };
111 UVersionInfo version;
112 u_versionFromString(version, v);
113 if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
114 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
115 ) {
116 char buffer[U_MAX_VERSION_STRING_LENGTH];
117 u_versionToString(unicodeVersion, buffer);
118 fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
119 buffer, v);
120 exit(U_ILLEGAL_ARGUMENT_ERROR);
121 }
122 memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
729e4ab9
A
123}
124
729e4ab9
A
125Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
126 if(p!=NULL) {
127 if(p->mappingType!=Norm::NONE) {
128 if( overrideHandling==OVERRIDE_NONE ||
129 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
130 ) {
131 fprintf(stderr,
132 "error in gennorm2 phase %d: "
133 "not permitted to override mapping for U+%04lX from phase %d\n",
134 (int)phase, (long)c, (int)p->mappingPhase);
135 exit(U_INVALID_FORMAT_ERROR);
136 }
137 delete p->mapping;
138 p->mapping=NULL;
139 }
140 p->mappingPhase=phase;
141 }
142 return p;
143}
144
145void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
146 overrideHandling=oh;
147 ++phase;
148}
149
150void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
0f5d89e8
A
151 norms.createNorm(c)->cc=cc;
152 norms.ccSet.add(c);
729e4ab9
A
153}
154
155static UBool isWellFormed(const UnicodeString &s) {
156 UErrorCode errorCode=U_ZERO_ERROR;
f3c0d7a5 157 u_strToUTF8(NULL, 0, NULL, toUCharPtr(s.getBuffer()), s.length(), &errorCode);
729e4ab9
A
158 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
159}
160
161void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
162 if(!isWellFormed(m)) {
163 fprintf(stderr,
164 "error in gennorm2 phase %d: "
165 "illegal one-way mapping from U+%04lX to malformed string\n",
166 (int)phase, (long)c);
167 exit(U_INVALID_FORMAT_ERROR);
168 }
0f5d89e8 169 Norm *p=checkNormForMapping(norms.createNorm(c), c);
729e4ab9
A
170 p->mapping=new UnicodeString(m);
171 p->mappingType=Norm::ONE_WAY;
172 p->setMappingCP();
0f5d89e8 173 norms.mappingSet.add(c);
729e4ab9
A
174}
175
176void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
177 if(U_IS_SURROGATE(c)) {
178 fprintf(stderr,
179 "error in gennorm2 phase %d: "
180 "illegal round-trip mapping from surrogate code point U+%04lX\n",
181 (int)phase, (long)c);
182 exit(U_INVALID_FORMAT_ERROR);
183 }
184 if(!isWellFormed(m)) {
185 fprintf(stderr,
186 "error in gennorm2 phase %d: "
187 "illegal round-trip mapping from U+%04lX to malformed string\n",
188 (int)phase, (long)c);
189 exit(U_INVALID_FORMAT_ERROR);
190 }
f3c0d7a5 191 int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length());
729e4ab9
A
192 if(numCP!=2) {
193 fprintf(stderr,
194 "error in gennorm2 phase %d: "
195 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
196 (int)phase, (long)c, (int)numCP);
197 exit(U_INVALID_FORMAT_ERROR);
198 }
0f5d89e8 199 Norm *p=checkNormForMapping(norms.createNorm(c), c);
729e4ab9
A
200 p->mapping=new UnicodeString(m);
201 p->mappingType=Norm::ROUND_TRIP;
202 p->mappingCP=U_SENTINEL;
0f5d89e8 203 norms.mappingSet.add(c);
729e4ab9
A
204}
205
206void Normalizer2DataBuilder::removeMapping(UChar32 c) {
0f5d89e8
A
207 // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
208 Norm *p=checkNormForMapping(norms.createNorm(c), c);
209 p->mappingType=Norm::REMOVED;
210 norms.mappingSet.add(c);
729e4ab9
A
211}
212
0f5d89e8
A
213UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
214 Norm::MappingType mappingType) const {
729e4ab9 215 if(buffer.isEmpty()) {
0f5d89e8 216 return FALSE; // Maps-to-empty-string is no boundary of any kind.
729e4ab9
A
217 }
218 int32_t lastStarterIndex=buffer.lastStarterIndex();
219 if(lastStarterIndex<0) {
0f5d89e8
A
220 return FALSE; // no starter
221 }
222 const int32_t lastIndex=buffer.length()-1;
223 if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) {
224 // One-way mapping where after the last starter is at least one combining mark
225 // with a combining class greater than 1,
226 // which means that another combining mark can reorder before it.
227 // By contrast, in a round-trip mapping this does not prevent a boundary as long as
228 // the starter or composite does not combine-forward with a following combining mark.
229 return FALSE;
729e4ab9
A
230 }
231 UChar32 starter=buffer.charAt(lastStarterIndex);
0f5d89e8
A
232 if(lastStarterIndex==0 && norms.combinesBack(starter)) {
233 // The last starter is at the beginning of the mapping and combines backward.
234 return FALSE;
235 }
236 if(Hangul::isJamoL(starter) ||
237 (Hangul::isJamoV(starter) &&
238 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
729e4ab9
A
239 // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
240 // otherwise it is blocked.
0f5d89e8 241 return lastStarterIndex!=lastIndex;
729e4ab9 242 }
4388f060 243 // Note: There can be no Hangul syllable in the fully decomposed mapping.
0f5d89e8
A
244
245 // Multiple starters can combine into one.
246 // Look for the first of the last sequence of starters, excluding Jamos.
247 int32_t i=lastStarterIndex;
248 UChar32 c;
249 while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) {
250 starter=c;
251 --i;
252 }
253 // Compose as far as possible, and see if further compositions with
254 // characters following this mapping are possible.
255 const Norm *starterNorm=norms.getNorm(starter);
256 if(i==lastStarterIndex &&
257 (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
258 return TRUE; // The last starter does not combine forward.
729e4ab9 259 }
729e4ab9 260 uint8_t prevCC=0;
0f5d89e8
A
261 while(++i<buffer.length()) {
262 uint8_t cc=buffer.ccAt(i); // !=0 if after last starter
263 if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
264 // The starter combines with a mark that reorders before the current one.
265 return FALSE;
729e4ab9 266 }
0f5d89e8
A
267 UChar32 c=buffer.charAt(i);
268 if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
269 norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) {
270 // The starter combines with c into a composite replacement starter.
271 starterNorm=norms.getNorm(starter);
272 if(i>=lastStarterIndex &&
273 (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
274 return TRUE; // The composite does not combine further.
729e4ab9 275 }
0f5d89e8
A
276 // Keep prevCC because we "removed" the combining mark.
277 } else if(cc==0) {
278 starterNorm=norms.getNorm(c);
279 if(i==lastStarterIndex &&
280 (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
281 return TRUE; // The new starter does not combine forward.
282 }
283 prevCC=0;
729e4ab9
A
284 } else {
285 prevCC=cc;
4388f060 286 }
729e4ab9 287 }
0f5d89e8
A
288 if(prevCC==0) {
289 return FALSE; // forward-combining starter at the very end
729e4ab9 290 }
0f5d89e8
A
291 if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
292 // The starter combines with another mark.
293 return FALSE;
729e4ab9 294 }
0f5d89e8 295 return TRUE;
729e4ab9
A
296}
297
0f5d89e8
A
298UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const {
299 if(buffer.lastStarterIndex()<0) {
300 return FALSE; // no starter
729e4ab9 301 }
0f5d89e8
A
302 const Norm *starterNorm=nullptr;
303 uint8_t prevCC=0;
304 for(int32_t i=0; i<buffer.length(); ++i) {
305 UChar32 c=buffer.charAt(i);
306 uint8_t cc=buffer.ccAt(i);
307 if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
308 norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) {
309 return TRUE; // normal composite
310 } else if(cc==0) {
311 if(Hangul::isJamoL(c)) {
312 if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) {
313 return TRUE; // Hangul syllable
314 }
315 starterNorm=nullptr;
729e4ab9 316 } else {
0f5d89e8 317 starterNorm=norms.getNorm(c);
729e4ab9 318 }
729e4ab9 319 }
0f5d89e8 320 prevCC=cc;
729e4ab9 321 }
0f5d89e8 322 return FALSE;
729e4ab9
A
323}
324
0f5d89e8
A
325void Normalizer2DataBuilder::postProcess(Norm &norm) {
326 // Prerequisites: Compositions are built, mappings are recursively decomposed.
327 // Mappings are not yet in canonical order.
328 //
329 // This function works on a Norm struct. We do not know which code point(s) map(s) to it.
330 // Therefore, we cannot compute algorithmic mapping deltas here.
331 // Error conditions are checked, but printed later when we do know the offending code point.
332 if(norm.hasMapping()) {
333 if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) {
334 norm.error="mapping longer than maximum of 31";
335 return;
729e4ab9 336 }
0f5d89e8
A
337 // Ensure canonical order.
338 BuilderReorderingBuffer buffer;
339 if(norm.rawMapping!=nullptr) {
340 norms.reorder(*norm.rawMapping, buffer);
341 buffer.reset();
729e4ab9 342 }
0f5d89e8
A
343 norms.reorder(*norm.mapping, buffer);
344 if(buffer.isEmpty()) {
345 // A character that is deleted (maps to an empty string) must
346 // get the worst-case lccc and tccc values because arbitrary
347 // characters on both sides will become adjacent.
348 norm.leadCC=1;
349 norm.trailCC=0xff;
4388f060 350 } else {
0f5d89e8
A
351 norm.leadCC=buffer.ccAt(0);
352 norm.trailCC=buffer.ccAt(buffer.length()-1);
729e4ab9 353 }
0f5d89e8
A
354
355 norm.hasCompBoundaryBefore=
356 !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
357 norm.hasCompBoundaryAfter=
358 norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType);
359
360 if(norm.combinesBack) {
361 norm.error="combines-back and decomposes, not possible in Unicode normalization";
362 } else if(norm.mappingType==Norm::ROUND_TRIP) {
363 if(norm.compositions!=NULL) {
364 norm.type=Norm::YES_NO_COMBINES_FWD;
365 } else {
366 norm.type=Norm::YES_NO_MAPPING_ONLY;
367 }
368 } else { // one-way mapping
369 if(norm.compositions!=NULL) {
370 norm.error="combines-forward and has a one-way mapping, "
371 "not possible in Unicode normalization";
372 } else if(buffer.isEmpty()) {
373 norm.type=Norm::NO_NO_EMPTY;
374 } else if(!norm.hasCompBoundaryBefore) {
375 norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC;
376 } else if(mappingRecomposes(buffer)) {
377 norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE;
378 } else {
379 // The mapping is comp-normalized.
380 norm.type=Norm::NO_NO_COMP_YES;
729e4ab9
A
381 }
382 }
0f5d89e8
A
383 } else { // no mapping
384 norm.leadCC=norm.trailCC=norm.cc;
385
386 norm.hasCompBoundaryBefore=
387 norm.cc==0 && !norm.combinesBack;
388 norm.hasCompBoundaryAfter=
389 norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr;
390
391 if(norm.combinesBack) {
392 if(norm.compositions!=nullptr) {
393 // Earlier code checked ccc=0.
394 norm.type=Norm::MAYBE_YES_COMBINES_FWD;
729e4ab9 395 } else {
0f5d89e8 396 norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc
729e4ab9 397 }
0f5d89e8
A
398 } else if(norm.compositions!=nullptr) {
399 // Earlier code checked ccc=0.
400 norm.type=Norm::YES_YES_COMBINES_FWD;
401 } else if(norm.cc!=0) {
402 norm.type=Norm::YES_YES_WITH_CC;
403 } else {
404 norm.type=Norm::INERT;
729e4ab9
A
405 }
406 }
407}
408
0f5d89e8 409class Norm16Writer : public Norms::Enumerator {
729e4ab9 410public:
3d1f044b
A
411 Norm16Writer(UMutableCPTrie *trie, Norms &n, Normalizer2DataBuilder &b) :
412 Norms::Enumerator(n), builder(b), norm16Trie(trie) {}
0f5d89e8 413 void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE {
3d1f044b 414 builder.writeNorm16(norm16Trie, start, end, norm);
729e4ab9 415 }
0f5d89e8 416 Normalizer2DataBuilder &builder;
3d1f044b 417 UMutableCPTrie *norm16Trie;
729e4ab9
A
418};
419
0f5d89e8
A
420void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
421 UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
422 smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
423}
424
3d1f044b 425void Normalizer2DataBuilder::writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm) {
0f5d89e8
A
426 if((norm.leadCC|norm.trailCC)!=0) {
427 for(UChar32 c=start; c<=end; ++c) {
428 setSmallFCD(c);
429 }
430 }
431
432 int32_t norm16;
433 switch(norm.type) {
434 case Norm::INERT:
435 norm16=Normalizer2Impl::INERT;
436 break;
437 case Norm::YES_YES_COMBINES_FWD:
438 norm16=norm.offset*2;
439 break;
440 case Norm::YES_NO_COMBINES_FWD:
441 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2;
442 break;
443 case Norm::YES_NO_MAPPING_ONLY:
444 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2;
445 break;
446 case Norm::NO_NO_COMP_YES:
447 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2;
448 break;
449 case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
450 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2;
451 break;
452 case Norm::NO_NO_COMP_NO_MAYBE_CC:
453 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2;
454 break;
455 case Norm::NO_NO_EMPTY:
456 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2;
457 break;
458 case Norm::NO_NO_DELTA:
459 {
460 // Positive offset from minNoNoDelta, shifted left for additional bits.
461 int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT;
462 if(norm.trailCC==0) {
463 // DELTA_TCCC_0==0
464 } else if(norm.trailCC==1) {
465 offset|=Normalizer2Impl::DELTA_TCCC_1;
466 } else {
467 offset|=Normalizer2Impl::DELTA_TCCC_GT_1;
729e4ab9 468 }
0f5d89e8 469 norm16=getMinNoNoDelta()+offset;
729e4ab9 470 break;
729e4ab9 471 }
0f5d89e8
A
472 case Norm::MAYBE_YES_COMBINES_FWD:
473 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2;
474 break;
475 case Norm::MAYBE_YES_SIMPLE:
476 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255
477 break;
478 case Norm::YES_YES_WITH_CC:
479 U_ASSERT(norm.cc!=0);
480 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255
481 break;
482 default: // Should not occur.
483 exit(U_INTERNAL_PROGRAM_ERROR);
484 }
485 U_ASSERT((norm16&1)==0);
486 if(norm.hasCompBoundaryAfter) {
487 norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
488 }
489 IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
3d1f044b 490 umutablecptrie_setRange(norm16Trie, start, end, (uint32_t)norm16, errorCode);
0f5d89e8
A
491
492 // Set the minimum code points for real data lookups in the quick check loops.
493 UBool isDecompNo=
494 (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) ||
495 norm.cc!=0;
496 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
497 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
498 }
499 UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES;
500 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
501 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
502 }
503 if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) {
504 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start;
729e4ab9
A
505 }
506}
507
3d1f044b 508void Normalizer2DataBuilder::setHangulData(UMutableCPTrie *norm16Trie) {
729e4ab9
A
509 HangulIterator hi;
510 const HangulIterator::Range *range;
511 // Check that none of the Hangul/Jamo code points have data.
512 while((range=hi.nextRange())!=NULL) {
0f5d89e8 513 for(UChar32 c=range->start; c<=range->end; ++c) {
3d1f044b 514 if(umutablecptrie_get(norm16Trie, c)>Normalizer2Impl::INERT) {
729e4ab9
A
515 fprintf(stderr,
516 "gennorm2 error: "
517 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
518 (long)c);
519 exit(U_INVALID_FORMAT_ERROR);
520 }
521 }
522 }
523 // Set data for algorithmic runtime handling.
524 IcuToolErrorCode errorCode("gennorm2/setHangulData()");
0f5d89e8
A
525
526 // Jamo V/T are maybeYes
527 if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
528 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE;
729e4ab9 529 }
3d1f044b
A
530 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END,
531 Normalizer2Impl::JAMO_L, errorCode);
532 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END,
533 Normalizer2Impl::JAMO_VT, errorCode);
0f5d89e8 534 // JAMO_T_BASE+1: not U+11A7
3d1f044b
A
535 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END,
536 Normalizer2Impl::JAMO_VT, errorCode);
0f5d89e8
A
537
538 // Hangul LV encoded as minYesNo
539 uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO];
540 // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
541 uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]|
542 Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
543 if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
544 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE;
545 }
546 // Set the first LV, then write all other Hangul syllables as LVT,
547 // then overwrite the remaining LV.
3d1f044b
A
548 umutablecptrie_set(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode);
549 umutablecptrie_setRange(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, lvt, errorCode);
0f5d89e8
A
550 UChar32 c=Hangul::HANGUL_BASE;
551 while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) {
3d1f044b 552 umutablecptrie_set(norm16Trie, c, lv, errorCode);
0f5d89e8
A
553 }
554 errorCode.assertSuccess();
729e4ab9
A
555}
556
3d1f044b 557LocalUCPTriePointer Normalizer2DataBuilder::processData() {
0f5d89e8
A
558 // Build composition lists before recursive decomposition,
559 // so that we still have the raw, pair-wise mappings.
560 CompositionBuilder compBuilder(norms);
561 norms.enumRanges(compBuilder);
729e4ab9 562
0f5d89e8
A
563 // Recursively decompose all mappings.
564 Decomposer decomposer(norms);
729e4ab9
A
565 do {
566 decomposer.didDecompose=FALSE;
0f5d89e8 567 norms.enumRanges(decomposer);
729e4ab9
A
568 } while(decomposer.didDecompose);
569
0f5d89e8
A
570 // Set the Norm::Type and other properties.
571 int32_t normsLength=norms.length();
729e4ab9 572 for(int32_t i=1; i<normsLength; ++i) {
0f5d89e8 573 postProcess(norms.getNormRefByIndex(i));
729e4ab9
A
574 }
575
0f5d89e8
A
576 // Write the properties, mappings and composition lists to
577 // appropriate parts of the "extra data" array.
578 ExtraData extra(norms, optimization==OPTIMIZE_FAST);
579 norms.enumRanges(extra);
580
581 extraData=extra.yesYesCompositions;
582 indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2;
583 extraData.append(extra.yesNoMappingsAndCompositions);
584 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2;
585 extraData.append(extra.yesNoMappingsOnly);
586 indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2;
587 extraData.append(extra.noNoMappingsCompYes);
588 indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2;
589 extraData.append(extra.noNoMappingsCompBoundaryBefore);
590 indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2;
591 extraData.append(extra.noNoMappingsCompNoMaybeCC);
592 indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2;
593 extraData.append(extra.noNoMappingsEmpty);
594 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2;
595
596 // Pad the maybeYesCompositions length to a multiple of 4,
597 // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
598 while(extra.maybeYesCompositions.length()&3) {
599 extra.maybeYesCompositions.append((UChar)0);
600 }
601 extraData.insert(0, extra.maybeYesCompositions);
602 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
603 Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
604 extra.maybeYesCompositions.length()*2;
729e4ab9 605
729e4ab9
A
606 // Pad to even length for 4-byte alignment of following data.
607 if(extraData.length()&1) {
608 extraData.append((UChar)0);
609 }
610
0f5d89e8
A
611 int32_t minNoNoDelta=getMinNoNoDelta();
612 U_ASSERT((minNoNoDelta&7)==0);
729e4ab9
A
613 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
614 fprintf(stderr,
615 "gennorm2 error: "
616 "data structure overflow, too much mapping composition data\n");
617 exit(U_BUFFER_OVERFLOW_ERROR);
618 }
619
0f5d89e8
A
620 // writeNorm16() and setHangulData() reduce these as needed.
621 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
622 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
623 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000;
624
3d1f044b
A
625 IcuToolErrorCode errorCode("gennorm2/processData()");
626 UMutableCPTrie *norm16Trie = umutablecptrie_open(
627 Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode);
628 errorCode.assertSuccess();
629
0f5d89e8
A
630 // Map each code point to its norm16 value,
631 // including the properties that fit directly,
632 // and the offset to the "extra data" if necessary.
3d1f044b 633 Norm16Writer norm16Writer(norm16Trie, norms, *this);
0f5d89e8 634 norms.enumRanges(norm16Writer);
3d1f044b 635 // TODO: iterate via getRange() instead of callback?
729e4ab9 636
3d1f044b 637 setHangulData(norm16Trie);
729e4ab9
A
638
639 // Look for the "worst" norm16 value of any supplementary code point
640 // corresponding to a lead surrogate, and set it as that surrogate's value.
0f5d89e8 641 // Enables UTF-16 quick check inner loops to look at only code units.
729e4ab9
A
642 //
643 // We could be more sophisticated:
644 // We could collect a bit set for whether there are values in the different
645 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
646 // and select the best value that only breaks the composition and/or decomposition
647 // inner loops if necessary.
648 // However, that seems like overkill for an optimization for supplementary characters.
3d1f044b
A
649 //
650 // First check that surrogate code *points* are inert.
651 // The parser should have rejected values/mappings for them.
652 uint32_t value;
653 UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0,
654 nullptr, nullptr, &value);
655 if (value != Normalizer2Impl::INERT || end < 0xdfff) {
656 fprintf(stderr,
657 "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n",
658 (int)end, (long)value);
659 exit(U_INTERNAL_PROGRAM_ERROR);
660 }
661 uint32_t maxNorm16 = 0;
662 // ANDing values yields 0 bits where any value has a 0.
663 // Used for worst-case HAS_COMP_BOUNDARY_AFTER.
664 uint32_t andedNorm16 = 0;
665 end = 0;
666 for (UChar32 start = 0x10000;;) {
667 if (start > end) {
668 end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0,
669 nullptr, nullptr, &value);
670 if (end < 0) { break; }
671 }
672 if ((start & 0x3ff) == 0) {
673 // Data for a new lead surrogate.
674 maxNorm16 = andedNorm16 = value;
675 } else {
676 if (value > maxNorm16) {
677 maxNorm16 = value;
678 }
679 andedNorm16 &= value;
680 }
681 // Intersect each range with the code points for one lead surrogate.
682 UChar32 leadEnd = start | 0x3ff;
683 if (leadEnd <= end) {
684 // End of the supplementary block for a lead surrogate.
685 if (maxNorm16 >= (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]) {
686 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
687 // Otherwise it might end up at something like JAMO_VT which stays in
688 // the inner decomposition quick check loop.
689 maxNorm16 = (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO];
690 }
691 maxNorm16 =
692 (maxNorm16 & ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)|
693 (andedNorm16 & Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER);
694 if (maxNorm16 != Normalizer2Impl::INERT) {
695 umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode);
696 }
697 if (value == Normalizer2Impl::INERT) {
698 // Potentially skip inert supplementary blocks for several lead surrogates.
699 start = (end + 1) & ~0x3ff;
700 } else {
701 start = leadEnd + 1;
702 }
703 } else {
704 start = end + 1;
729e4ab9 705 }
729e4ab9
A
706 }
707
708 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
709 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
710 // which is harmless.
711 // As a result, the minimum code points are always BMP code points.
712 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
713 if(minCP>=0x10000) {
714 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
715 }
716 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
717 if(minCP>=0x10000) {
718 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
719 }
0f5d89e8
A
720 minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP];
721 if(minCP>=0x10000) {
722 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP);
723 }
729e4ab9 724
3d1f044b
A
725 LocalUCPTriePointer builtTrie(
726 umutablecptrie_buildImmutable(norm16Trie, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16, errorCode));
727 norm16TrieLength=ucptrie_toBinary(builtTrie.getAlias(), nullptr, 0, errorCode);
729e4ab9 728 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
3d1f044b 729 fprintf(stderr, "gennorm2 error: unable to build/serialize the normalization trie - %s\n",
729e4ab9
A
730 errorCode.errorName());
731 exit(errorCode.reset());
732 }
3d1f044b 733 umutablecptrie_close(norm16Trie);
729e4ab9 734 errorCode.reset();
3d1f044b
A
735 norm16TrieBytes=new uint8_t[norm16TrieLength];
736 ucptrie_toBinary(builtTrie.getAlias(), norm16TrieBytes, norm16TrieLength, errorCode);
737 errorCode.assertSuccess();
729e4ab9
A
738
739 int32_t offset=(int32_t)sizeof(indexes);
740 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
741 offset+=norm16TrieLength;
742 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
4388f060
A
743 offset+=extraData.length()*2;
744 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
745 offset+=sizeof(smallFCD);
746 int32_t totalSize=offset;
747 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
729e4ab9
A
748 indexes[i]=totalSize;
749 }
750
751 if(beVerbose) {
752 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength);
753 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length());
4388f060 754 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD));
729e4ab9
A
755 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize);
756 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
757 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
0f5d89e8
A
758 printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]);
759 printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
4388f060 760 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
0f5d89e8
A
761 printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
762 printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
763 printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
764 printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]);
729e4ab9 765 printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
0f5d89e8 766 printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta);
729e4ab9
A
767 printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
768 }
769
4388f060
A
770 UVersionInfo nullVersion={ 0, 0, 0, 0 };
771 if(0==memcmp(nullVersion, unicodeVersion, 4)) {
772 u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
773 }
729e4ab9 774 memcpy(dataInfo.dataVersion, unicodeVersion, 4);
3d1f044b 775 return builtTrie;
b331163b
A
776}
777
778void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
779 processData();
780
781 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
729e4ab9
A
782 UNewDataMemory *pData=
783 udata_create(NULL, NULL, filename, &dataInfo,
784 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
785 if(errorCode.isFailure()) {
786 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
787 filename, errorCode.errorName());
788 exit(errorCode.reset());
789 }
790 udata_writeBlock(pData, indexes, sizeof(indexes));
3d1f044b 791 udata_writeBlock(pData, norm16TrieBytes, norm16TrieLength);
f3c0d7a5 792 udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length());
4388f060 793 udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
729e4ab9
A
794 int32_t writtenSize=udata_finish(pData, errorCode);
795 if(errorCode.isFailure()) {
796 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
797 exit(errorCode.reset());
798 }
b331163b 799 int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
729e4ab9
A
800 if(writtenSize!=totalSize) {
801 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
802 (long)writtenSize, (long)totalSize);
803 exit(U_INTERNAL_PROGRAM_ERROR);
804 }
805}
806
b331163b
A
807void
808Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
3d1f044b 809 LocalUCPTriePointer norm16Trie = processData();
b331163b
A
810
811 IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
812 const char *basename=findBasename(filename);
813 CharString path(filename, (int32_t)(basename-filename), errorCode);
814 CharString dataName(basename, errorCode);
815 const char *extension=strrchr(basename, '.');
816 if(extension!=NULL) {
817 dataName.truncate((int32_t)(extension-basename));
818 }
3d1f044b 819 const char *name=dataName.data();
b331163b
A
820 errorCode.assertSuccess();
821
3d1f044b 822 FILE *f=usrc_create(path.data(), basename, 2016, "icu/source/tools/gennorm2/n2builder.cpp");
b331163b
A
823 if(f==NULL) {
824 fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
825 filename);
826 exit(U_FILE_ACCESS_ERROR);
b331163b 827 }
f3c0d7a5 828 fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f);
3d1f044b 829
b331163b 830 char line[100];
3d1f044b 831 sprintf(line, "static const UVersionInfo %s_formatVersion={", name);
b331163b 832 usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n");
3d1f044b 833 sprintf(line, "static const UVersionInfo %s_dataVersion={", name);
b331163b 834 usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n");
3d1f044b
A
835 sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name);
836 usrc_writeArray(f, line, indexes, 32, Normalizer2Impl::IX_COUNT, "\n};\n\n");
837
838 usrc_writeUCPTrie(f, name, norm16Trie.getAlias());
839
840 sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", name);
841 usrc_writeArray(f, line, extraData.getBuffer(), 16, extraData.length(), "\n};\n\n");
842 sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", name);
843 usrc_writeArray(f, line, smallFCD, 8, sizeof(smallFCD), "\n};\n\n");
844
845 fputs("#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f);
b331163b
A
846 fclose(f);
847}
848
0f5d89e8
A
849namespace {
850
851bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
852 if(s1 == nullptr) {
853 return s2 == nullptr;
854 } else if(s2 == nullptr) {
855 return false;
856 } else {
857 return *s1 == *s2;
858 }
859}
860
861const char *typeChars = "?-=>";
862
863void writeMapping(FILE *f, const UnicodeString *m) {
864 if(m != nullptr && !m->isEmpty()) {
865 int32_t i = 0;
866 UChar32 c = m->char32At(i);
867 fprintf(f, "%04lX", (long)c);
868 while((i += U16_LENGTH(c)) < m->length()) {
869 c = m->char32At(i);
870 fprintf(f, " %04lX", (long)c);
871 }
872 }
873 fputs("\n", f);
874}
875
876} // namespace
877
878void
879Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
880 // Do not processData() before writing the input-syntax data file.
881 FILE *f = fopen(filename, "w");
882 if(f == nullptr) {
883 fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
884 filename);
885 exit(U_FILE_ACCESS_ERROR);
886 return;
887 }
888
889 if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
890 unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
891 char uv[U_MAX_VERSION_STRING_LENGTH];
892 u_versionToString(unicodeVersion, uv);
893 fprintf(f, "* Unicode %s\n\n", uv);
894 }
895
896 UnicodeSetIterator ccIter(norms.ccSet);
897 UChar32 start = U_SENTINEL;
898 UChar32 end = U_SENTINEL;
899 uint8_t prevCC = 0;
900 bool done = false;
901 bool didWrite = false;
902 do {
903 UChar32 c;
904 uint8_t cc;
905 if(ccIter.next() && !ccIter.isString()) {
906 c = ccIter.getCodepoint();
907 cc = norms.getCC(c);
908 } else {
909 c = 0x110000;
910 cc = 0;
911 done = true;
912 }
913 if(cc == prevCC && c == (end + 1)) {
914 end = c;
915 } else {
916 if(prevCC != 0) {
917 if(start == end) {
918 fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
919 } else {
920 fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
921 }
922 didWrite = true;
923 }
924 start = end = c;
925 prevCC = cc;
926 }
927 } while(!done);
928 if(didWrite) {
929 fputs("\n", f);
930 }
931
932 UnicodeSetIterator mIter(norms.mappingSet);
933 start = U_SENTINEL;
934 end = U_SENTINEL;
935 const UnicodeString *prevMapping = nullptr;
936 Norm::MappingType prevType = Norm::NONE;
937 done = false;
938 do {
939 UChar32 c;
940 const Norm *norm;
941 if(mIter.next() && !mIter.isString()) {
942 c = mIter.getCodepoint();
943 norm = norms.getNorm(c);
944 } else {
945 c = 0x110000;
946 norm = nullptr;
947 done = true;
948 }
949 const UnicodeString *mapping;
950 Norm::MappingType type;
951 if(norm == nullptr) {
952 mapping = nullptr;
953 type = Norm::NONE;
954 } else {
955 type = norm->mappingType;
956 if(type == Norm::NONE) {
957 mapping = nullptr;
958 } else {
959 mapping = norm->mapping;
960 }
961 }
962 if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
963 end = c;
964 } else {
965 if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
966 if(start == end) {
967 fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
968 } else {
969 fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
970 }
971 writeMapping(f, prevMapping);
972 }
973 start = end = c;
974 prevMapping = mapping;
975 prevType = type;
976 }
977 } while(!done);
978
979 fclose(f);
980}
981
982void
983Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
984 const Normalizer2DataBuilder &b2,
985 Normalizer2DataBuilder &diff) {
986 // Compute diff = b1 - b2
987 // so that we should be able to get b1 = b2 + diff.
988 if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
989 memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
990 }
991
992 UnicodeSet ccSet(b1.norms.ccSet);
993 ccSet.addAll(b2.norms.ccSet);
994 UnicodeSetIterator ccIter(ccSet);
995 while(ccIter.next() && !ccIter.isString()) {
996 UChar32 c = ccIter.getCodepoint();
997 uint8_t cc1 = b1.norms.getCC(c);
998 uint8_t cc2 = b2.norms.getCC(c);
999 if(cc1 != cc2) {
1000 diff.setCC(c, cc1);
1001 }
1002 }
1003
1004 UnicodeSet mSet(b1.norms.mappingSet);
1005 mSet.addAll(b2.norms.mappingSet);
1006 UnicodeSetIterator mIter(mSet);
1007 while(mIter.next() && !mIter.isString()) {
1008 UChar32 c = mIter.getCodepoint();
1009 const Norm *norm1 = b1.norms.getNorm(c);
1010 const Norm *norm2 = b2.norms.getNorm(c);
1011 const UnicodeString *mapping1;
1012 Norm::MappingType type1;
1013 if(norm1 == nullptr || !norm1->hasMapping()) {
1014 mapping1 = nullptr;
1015 type1 = Norm::NONE;
1016 } else {
1017 mapping1 = norm1->mapping;
1018 type1 = norm1->mappingType;
1019 }
1020 const UnicodeString *mapping2;
1021 Norm::MappingType type2;
1022 if(norm2 == nullptr || !norm2->hasMapping()) {
1023 mapping2 = nullptr;
1024 type2 = Norm::NONE;
1025 } else {
1026 mapping2 = norm2->mapping;
1027 type2 = norm2->mappingType;
1028 }
1029 if(type1 == type2 && equalStrings(mapping1, mapping2)) {
1030 // Nothing to do.
1031 } else if(type1 == Norm::NONE) {
1032 diff.removeMapping(c);
1033 } else if(type1 == Norm::ROUND_TRIP) {
1034 diff.setRoundTripMapping(c, *mapping1);
1035 } else if(type1 == Norm::ONE_WAY) {
1036 diff.setOneWayMapping(c, *mapping1);
1037 }
1038 }
1039}
1040
729e4ab9
A
1041U_NAMESPACE_END
1042
1043#endif /* #if !UCONFIG_NO_NORMALIZATION */
1044
1045/*
1046 * Hey, Emacs, please set the following:
1047 *
1048 * Local Variables:
1049 * indent-tabs-mode: nil
1050 * End:
1051 */