1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: n2builder.cpp
12 * tab size: 8 (not used)
15 * created on: 2009nov25
16 * created by: Markus W. Scherer
18 * Builds Normalizer2 data and writes a binary .nrm file.
19 * For the file format see source/common/normalizer2impl.h.
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
29 #include "unicode/errorcode.h"
30 #include "unicode/localpointer.h"
31 #include "unicode/putil.h"
32 #include "unicode/udata.h"
33 #include "unicode/uniset.h"
34 #include "unicode/unistr.h"
35 #include "unicode/usetiter.h"
36 #include "unicode/ustring.h"
38 #include "extradata.h"
40 #include "normalizer2impl.h"
48 #if !UCONFIG_NO_NORMALIZATION
// Data header description for the generated .nrm file.
// writeBinaryFile() passes it to udata_create(); processData() copies the
// Unicode version into dataInfo.dataVersion before any output is written.
// NOTE(review): the leading UDataInfo fields (original lines 52-59) are not visible in this excerpt.
50 /* UDataInfo cf. udata.h */
51 static UDataInfo dataInfo
={
60 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
61 { 3, 0, 0, 0 }, /* formatVersion */
62 { 10, 0, 0, 0 } /* dataVersion (Unicode version) */
// Small cursor over the static table HangulIterator::ranges.
// nextRange() hands out a pointer to the next table entry and advances
// rangeIndex as long as rangeIndex is below the table length.
// NOTE(review): the Range struct, the end-of-table branch and the rangeIndex
// member are not visible in this excerpt; setHangulData() loops until
// nextRange() returns NULL and reads range->start / range->end.
67 class HangulIterator
{
73 HangulIterator() : rangeIndex(0) {}
74 const Range
*nextRange() {
75 if(rangeIndex
<UPRV_LENGTHOF(ranges
)) {
76 return ranges
+rangeIndex
++;
82 static const Range ranges
[4];
86 const HangulIterator::Range
HangulIterator::ranges
[4]={
87 { Hangul::JAMO_L_BASE
, Hangul::JAMO_L_END
},
88 { Hangul::JAMO_V_BASE
, Hangul::JAMO_V_END
},
89 // JAMO_T_BASE+1: not U+11A7
90 { Hangul::JAMO_T_BASE
+1, Hangul::JAMO_T_END
},
91 { Hangul::HANGUL_BASE
, Hangul::HANGUL_END
},
// Constructs an empty builder: phase 0, OVERRIDE_PREVIOUS handling,
// OPTIMIZE_NORMAL, no trie yet (norm16Trie is nullptr, length 0).
// The unicodeVersion, indexes and smallFCD arrays are zero-filled.
// NOTE(review): original line 95 of the initializer list is not visible in this excerpt.
94 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode
&errorCode
) :
96 phase(0), overrideHandling(OVERRIDE_PREVIOUS
), optimization(OPTIMIZE_NORMAL
),
97 norm16Trie(nullptr), norm16TrieLength(0) {
98 memset(unicodeVersion
, 0, sizeof(unicodeVersion
));
99 memset(indexes
, 0, sizeof(indexes
));
100 memset(smallFCD
, 0, sizeof(smallFCD
));
103 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
104 utrie2_close(norm16Trie
);
// Records the Unicode version of the input data.
// v is parsed with u_versionFromString(). If a non-zero version is already
// stored in unicodeVersion and it differs from the parsed one, an error is
// printed to stderr and the tool exits with U_ILLEGAL_ARGUMENT_ERROR.
// Otherwise the parsed version is copied into unicodeVersion.
108 Normalizer2DataBuilder::setUnicodeVersion(const char *v
) {
109 UVersionInfo nullVersion
={ 0, 0, 0, 0 };
110 UVersionInfo version
;
111 u_versionFromString(version
, v
);
// Conflict only if the stored version is both different and not all-zero (unset).
112 if( 0!=memcmp(version
, unicodeVersion
, U_MAX_VERSION_LENGTH
) &&
113 0!=memcmp(nullVersion
, unicodeVersion
, U_MAX_VERSION_LENGTH
)
115 char buffer
[U_MAX_VERSION_STRING_LENGTH
];
116 u_versionToString(unicodeVersion
, buffer
);
117 fprintf(stderr
, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
119 exit(U_ILLEGAL_ARGUMENT_ERROR
);
121 memcpy(unicodeVersion
, version
, U_MAX_VERSION_LENGTH
);
// Verifies that a new mapping may be stored in *p for code point c.
// If p already has a mapping (mappingType!=Norm::NONE), overriding is refused
// when overrideHandling is OVERRIDE_NONE, or when it is OVERRIDE_PREVIOUS and
// the existing mapping was set in the current phase; in that case an error is
// reported and the tool exits with U_INVALID_FORMAT_ERROR.
// On success the current phase is recorded in p->mappingPhase.
// NOTE(review): the handling of the replaced mapping and the return statement
// are not visible in this excerpt; callers assign the result to a Norm pointer.
124 Norm
*Normalizer2DataBuilder::checkNormForMapping(Norm
*p
, UChar32 c
) {
126 if(p
->mappingType
!=Norm::NONE
) {
127 if( overrideHandling
==OVERRIDE_NONE
||
128 (overrideHandling
==OVERRIDE_PREVIOUS
&& p
->mappingPhase
==phase
)
131 "error in gennorm2 phase %d: "
132 "not permitted to override mapping for U+%04lX from phase %d\n",
133 (int)phase
, (long)c
, (int)p
->mappingPhase
);
134 exit(U_INVALID_FORMAT_ERROR
);
139 p
->mappingPhase
=phase
;
// Selects how later mappings may override earlier ones; the setting is
// consulted by checkNormForMapping().
// NOTE(review): the function body is not visible in this excerpt.
144 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh
) {
// Stores the canonical combining class cc for code point c,
// creating the Norm entry for c if necessary.
// NOTE(review): the embedded line numbering suggests one further statement
// after the assignment that is not visible in this excerpt.
149 void Normalizer2DataBuilder::setCC(UChar32 c
, uint8_t cc
) {
150 norms
.createNorm(c
)->cc
=cc
;
154 static UBool
isWellFormed(const UnicodeString
&s
) {
155 UErrorCode errorCode
=U_ZERO_ERROR
;
156 u_strToUTF8(NULL
, 0, NULL
, toUCharPtr(s
.getBuffer()), s
.length(), &errorCode
);
157 return U_SUCCESS(errorCode
) || errorCode
==U_BUFFER_OVERFLOW_ERROR
;
// Registers a one-way mapping from code point c to string m.
// A mapping string that fails isWellFormed() is a fatal input error
// (exit with U_INVALID_FORMAT_ERROR). Otherwise the Norm for c is created or
// fetched, checked via checkNormForMapping(), given a heap-allocated copy of m
// with type Norm::ONE_WAY, and c is added to norms.mappingSet.
// NOTE(review): original line 171 (between setting the type and updating
// mappingSet) is not visible in this excerpt.
160 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c
, const UnicodeString
&m
) {
161 if(!isWellFormed(m
)) {
163 "error in gennorm2 phase %d: "
164 "illegal one-way mapping from U+%04lX to malformed string\n",
165 (int)phase
, (long)c
);
166 exit(U_INVALID_FORMAT_ERROR
);
168 Norm
*p
=checkNormForMapping(norms
.createNorm(c
), c
);
169 p
->mapping
=new UnicodeString(m
);
170 p
->mappingType
=Norm::ONE_WAY
;
172 norms
.mappingSet
.add(c
);
// Registers a round-trip mapping from code point c to string m.
// Fatal input errors (exit with U_INVALID_FORMAT_ERROR):
//   - c is a surrogate code point
//   - m fails isWellFormed()
//   - the code point count of m is rejected (the message says "!=2 code points";
//     NOTE(review): the condition itself is not visible in this excerpt)
// Otherwise the Norm for c receives a heap-allocated copy of m with type
// Norm::ROUND_TRIP and mappingCP=U_SENTINEL, and c is added to norms.mappingSet.
175 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c
, const UnicodeString
&m
) {
176 if(U_IS_SURROGATE(c
)) {
178 "error in gennorm2 phase %d: "
179 "illegal round-trip mapping from surrogate code point U+%04lX\n",
180 (int)phase
, (long)c
);
181 exit(U_INVALID_FORMAT_ERROR
);
183 if(!isWellFormed(m
)) {
185 "error in gennorm2 phase %d: "
186 "illegal round-trip mapping from U+%04lX to malformed string\n",
187 (int)phase
, (long)c
);
188 exit(U_INVALID_FORMAT_ERROR
);
190 int32_t numCP
=u_countChar32(toUCharPtr(m
.getBuffer()), m
.length());
193 "error in gennorm2 phase %d: "
194 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
195 (int)phase
, (long)c
, (int)numCP
);
196 exit(U_INVALID_FORMAT_ERROR
);
198 Norm
*p
=checkNormForMapping(norms
.createNorm(c
), c
);
199 p
->mapping
=new UnicodeString(m
);
200 p
->mappingType
=Norm::ROUND_TRIP
;
201 p
->mappingCP
=U_SENTINEL
;
202 norms
.mappingSet
.add(c
);
205 void Normalizer2DataBuilder::removeMapping(UChar32 c
) {
206 // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
207 Norm
*p
=checkNormForMapping(norms
.createNorm(c
), c
);
208 p
->mappingType
=Norm::REMOVED
;
209 norms
.mappingSet
.add(c
);
// Decides whether there is a composition boundary after a mapping.
// buffer holds the mapping's code points with their combining classes
// (postProcess() fills it via norms.reorder()); mappingType is the type of
// the mapping being examined.
// Returns FALSE for an empty mapping or one without a starter. Otherwise it
// examines the last starter (and the run of starters leading up to it),
// handles Jamo L / LV specially, and simulates composition along the rest of
// the buffer to see whether the result could still combine with following text.
// NOTE(review): several statements (declarations of prevCC and c, some returns
// and closing braces) are not visible in this excerpt.
212 UBool
Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer
&buffer
,
213 Norm::MappingType mappingType
) const {
214 if(buffer
.isEmpty()) {
215 return FALSE
; // Maps-to-empty-string is no boundary of any kind.
217 int32_t lastStarterIndex
=buffer
.lastStarterIndex();
218 if(lastStarterIndex
<0) {
219 return FALSE
; // no starter
221 const int32_t lastIndex
=buffer
.length()-1;
222 if(mappingType
==Norm::ONE_WAY
&& lastStarterIndex
<lastIndex
&& buffer
.ccAt(lastIndex
)>1) {
223 // One-way mapping where after the last starter is at least one combining mark
224 // with a combining class greater than 1,
225 // which means that another combining mark can reorder before it.
226 // By contrast, in a round-trip mapping this does not prevent a boundary as long as
227 // the starter or composite does not combine-forward with a following combining mark.
230 UChar32 starter
=buffer
.charAt(lastStarterIndex
);
231 if(lastStarterIndex
==0 && norms
.combinesBack(starter
)) {
232 // The last starter is at the beginning of the mapping and combines backward.
235 if(Hangul::isJamoL(starter
) ||
236 (Hangul::isJamoV(starter
) &&
237 0<lastStarterIndex
&& Hangul::isJamoL(buffer
.charAt(lastStarterIndex
-1)))) {
238 // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
239 // otherwise it is blocked.
240 return lastStarterIndex
!=lastIndex
;
242 // Note: There can be no Hangul syllable in the fully decomposed mapping.
244 // Multiple starters can combine into one.
245 // Look for the first of the last sequence of starters, excluding Jamos.
246 int32_t i
=lastStarterIndex
;
248 while(0<i
&& buffer
.ccAt(i
-1)==0 && !Hangul::isJamo(c
=buffer
.charAt(i
-1))) {
252 // Compose as far as possible, and see if further compositions with
253 // characters following this mapping are possible.
254 const Norm
*starterNorm
=norms
.getNorm(starter
);
255 if(i
==lastStarterIndex
&&
256 (starterNorm
==nullptr || starterNorm
->compositions
==nullptr)) {
257 return TRUE
; // The last starter does not combine forward.
260 while(++i
<buffer
.length()) {
261 uint8_t cc
=buffer
.ccAt(i
); // !=0 if after last starter
262 if(i
>lastStarterIndex
&& norms
.combinesWithCCBetween(*starterNorm
, prevCC
, cc
)) {
263 // The starter combines with a mark that reorders before the current one.
266 UChar32 c
=buffer
.charAt(i
);
267 if(starterNorm
!=nullptr && (prevCC
<cc
|| prevCC
==0) &&
268 norms
.getNormRef(c
).combinesBack
&& (starter
=starterNorm
->combine(c
))>=0) {
269 // The starter combines with c into a composite replacement starter.
270 starterNorm
=norms
.getNorm(starter
);
271 if(i
>=lastStarterIndex
&&
272 (starterNorm
==nullptr || starterNorm
->compositions
==nullptr)) {
273 return TRUE
; // The composite does not combine further.
275 // Keep prevCC because we "removed" the combining mark.
277 starterNorm
=norms
.getNorm(c
);
278 if(i
==lastStarterIndex
&&
279 (starterNorm
==nullptr || starterNorm
->compositions
==nullptr)) {
280 return TRUE
; // The new starter does not combine forward.
288 return FALSE
; // forward-combining starter at the very end
290 if(norms
.combinesWithCCBetween(*starterNorm
, prevCC
, 256)) {
291 // The starter combines with another mark.
// Reports whether the mapping in buffer contains something that would compose.
// Returns FALSE when the buffer has no starter. Returns TRUE as soon as the
// current starter combines with a later, unblocked character, or when a
// Jamo L is directly followed by a Jamo V (which forms a Hangul syllable).
// NOTE(review): the prevCC bookkeeping and the final return are not visible
// in this excerpt.
297 UBool
Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer
&buffer
) const {
298 if(buffer
.lastStarterIndex()<0) {
299 return FALSE
; // no starter
301 const Norm
*starterNorm
=nullptr;
303 for(int32_t i
=0; i
<buffer
.length(); ++i
) {
304 UChar32 c
=buffer
.charAt(i
);
305 uint8_t cc
=buffer
.ccAt(i
);
306 if(starterNorm
!=nullptr && (prevCC
<cc
|| prevCC
==0) &&
307 norms
.getNormRef(c
).combinesBack
&& starterNorm
->combine(c
)>=0) {
308 return TRUE
; // normal composite
310 if(Hangul::isJamoL(c
)) {
311 if((i
+1)<buffer
.length() && Hangul::isJamoV(buffer
.charAt(i
+1))) {
312 return TRUE
; // Hangul syllable
316 starterNorm
=norms
.getNorm(c
);
// Derives the final properties of one Norm entry after compositions are built
// and mappings are decomposed: leadCC/trailCC, hasCompBoundaryBefore,
// hasCompBoundaryAfter and the Norm::Type used later by writeNorm16().
// Inconsistent data is not reported here; a message is stored in norm.error.
// Two main cases: the entry has a mapping (round-trip or one-way), or it has
// none, in which case the type follows from cc, combinesBack and compositions.
// NOTE(review): some else-branches and closing braces are not visible in this excerpt.
324 void Normalizer2DataBuilder::postProcess(Norm
&norm
) {
325 // Prerequisites: Compositions are built, mappings are recursively decomposed.
326 // Mappings are not yet in canonical order.
328 // This function works on a Norm struct. We do not know which code point(s) map(s) to it.
329 // Therefore, we cannot compute algorithmic mapping deltas here.
330 // Error conditions are checked, but printed later when we do know the offending code point.
331 if(norm
.hasMapping()) {
332 if(norm
.mapping
->length()>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
333 norm
.error
="mapping longer than maximum of 31";
336 // Ensure canonical order.
337 BuilderReorderingBuffer buffer
;
338 if(norm
.rawMapping
!=nullptr) {
339 norms
.reorder(*norm
.rawMapping
, buffer
);
342 norms
.reorder(*norm
.mapping
, buffer
);
343 if(buffer
.isEmpty()) {
344 // A character that is deleted (maps to an empty string) must
345 // get the worst-case lccc and tccc values because arbitrary
346 // characters on both sides will become adjacent.
350 norm
.leadCC
=buffer
.ccAt(0);
351 norm
.trailCC
=buffer
.ccAt(buffer
.length()-1);
354 norm
.hasCompBoundaryBefore
=
355 !buffer
.isEmpty() && norm
.leadCC
==0 && !norms
.combinesBack(buffer
.charAt(0));
356 norm
.hasCompBoundaryAfter
=
357 norm
.compositions
==nullptr && mappingHasCompBoundaryAfter(buffer
, norm
.mappingType
);
359 if(norm
.combinesBack
) {
360 norm
.error
="combines-back and decomposes, not possible in Unicode normalization";
361 } else if(norm
.mappingType
==Norm::ROUND_TRIP
) {
362 if(norm
.compositions
!=NULL
) {
363 norm
.type
=Norm::YES_NO_COMBINES_FWD
;
365 norm
.type
=Norm::YES_NO_MAPPING_ONLY
;
367 } else { // one-way mapping
368 if(norm
.compositions
!=NULL
) {
369 norm
.error
="combines-forward and has a one-way mapping, "
370 "not possible in Unicode normalization";
371 } else if(buffer
.isEmpty()) {
372 norm
.type
=Norm::NO_NO_EMPTY
;
373 } else if(!norm
.hasCompBoundaryBefore
) {
374 norm
.type
=Norm::NO_NO_COMP_NO_MAYBE_CC
;
375 } else if(mappingRecomposes(buffer
)) {
376 norm
.type
=Norm::NO_NO_COMP_BOUNDARY_BEFORE
;
378 // The mapping is comp-normalized.
379 norm
.type
=Norm::NO_NO_COMP_YES
;
382 } else { // no mapping
383 norm
.leadCC
=norm
.trailCC
=norm
.cc
;
385 norm
.hasCompBoundaryBefore
=
386 norm
.cc
==0 && !norm
.combinesBack
;
387 norm
.hasCompBoundaryAfter
=
388 norm
.cc
==0 && !norm
.combinesBack
&& norm
.compositions
==nullptr;
390 if(norm
.combinesBack
) {
391 if(norm
.compositions
!=nullptr) {
392 // Earlier code checked ccc=0.
393 norm
.type
=Norm::MAYBE_YES_COMBINES_FWD
;
395 norm
.type
=Norm::MAYBE_YES_SIMPLE
; // any ccc
397 } else if(norm
.compositions
!=nullptr) {
398 // Earlier code checked ccc=0.
399 norm
.type
=Norm::YES_YES_COMBINES_FWD
;
400 } else if(norm
.cc
!=0) {
401 norm
.type
=Norm::YES_YES_WITH_CC
;
403 norm
.type
=Norm::INERT
;
408 class Norm16Writer
: public Norms::Enumerator
{
410 Norm16Writer(Norms
&n
, Normalizer2DataBuilder
&b
) : Norms::Enumerator(n
), builder(b
) {}
411 void rangeHandler(UChar32 start
, UChar32 end
, Norm
&norm
) U_OVERRIDE
{
412 builder
.writeNorm16(start
, end
, norm
);
414 Normalizer2DataBuilder
&builder
;
417 void Normalizer2DataBuilder::setSmallFCD(UChar32 c
) {
418 UChar32 lead
= c
<=0xffff ? c
: U16_LEAD(c
);
419 smallFCD
[lead
>>8]|=(uint8_t)1<<((lead
>>5)&7);
// Computes the 16-bit trie value ("norm16") for a Norm and stores it in
// norm16Trie for the whole range [start, end].
// The value is chosen by norm.type: most types use the start of the matching
// extra-data section from indexes[] plus norm.offset*2; NO_NO_DELTA encodes a
// shifted delta plus trail-ccc bits; MAYBE_YES_SIMPLE and YES_YES_WITH_CC
// encode the combining class directly. An unexpected type exits with
// U_INTERNAL_PROGRAM_ERROR. HAS_COMP_BOUNDARY_AFTER is OR-ed in when set on
// the Norm. Finally the minimum-code-point indexes (decomp-no, comp-no-maybe,
// lccc) are lowered to start where applicable.
// NOTE(review): the body of the first loop, the switch header and the break
// statements are not visible in this excerpt.
422 void Normalizer2DataBuilder::writeNorm16(UChar32 start
, UChar32 end
, Norm
&norm
) {
423 if((norm
.leadCC
|norm
.trailCC
)!=0) {
424 for(UChar32 c
=start
; c
<=end
; ++c
) {
432 norm16
=Normalizer2Impl::INERT
;
434 case Norm::YES_YES_COMBINES_FWD
:
435 norm16
=norm
.offset
*2;
437 case Norm::YES_NO_COMBINES_FWD
:
438 norm16
=indexes
[Normalizer2Impl::IX_MIN_YES_NO
]+norm
.offset
*2;
440 case Norm::YES_NO_MAPPING_ONLY
:
441 norm16
=indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]+norm
.offset
*2;
443 case Norm::NO_NO_COMP_YES
:
444 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO
]+norm
.offset
*2;
446 case Norm::NO_NO_COMP_BOUNDARY_BEFORE
:
447 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
]+norm
.offset
*2;
449 case Norm::NO_NO_COMP_NO_MAYBE_CC
:
450 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC
]+norm
.offset
*2;
452 case Norm::NO_NO_EMPTY
:
453 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO_EMPTY
]+norm
.offset
*2;
455 case Norm::NO_NO_DELTA
:
457 // Positive offset from minNoNoDelta, shifted left for additional bits.
458 int32_t offset
=(norm
.offset
+Normalizer2Impl::MAX_DELTA
)<<Normalizer2Impl::DELTA_SHIFT
;
459 if(norm
.trailCC
==0) {
461 } else if(norm
.trailCC
==1) {
462 offset
|=Normalizer2Impl::DELTA_TCCC_1
;
464 offset
|=Normalizer2Impl::DELTA_TCCC_GT_1
;
466 norm16
=getMinNoNoDelta()+offset
;
469 case Norm::MAYBE_YES_COMBINES_FWD
:
470 norm16
=indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]+norm
.offset
*2;
472 case Norm::MAYBE_YES_SIMPLE
:
473 norm16
=Normalizer2Impl::MIN_NORMAL_MAYBE_YES
+norm
.cc
*2; // ccc=0..255
475 case Norm::YES_YES_WITH_CC
:
476 U_ASSERT(norm
.cc
!=0);
477 norm16
=Normalizer2Impl::MIN_YES_YES_WITH_CC
-2+norm
.cc
*2; // ccc=1..255
479 default: // Should not occur.
480 exit(U_INTERNAL_PROGRAM_ERROR
);
// Bit 0 must be free so that HAS_COMP_BOUNDARY_AFTER can be OR-ed in below.
482 U_ASSERT((norm16
&1)==0);
483 if(norm
.hasCompBoundaryAfter
) {
484 norm16
|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER
;
486 IcuToolErrorCode
errorCode("gennorm2/writeNorm16()");
487 utrie2_setRange32(norm16Trie
, start
, end
, (uint32_t)norm16
, TRUE
, errorCode
);
489 // Set the minimum code points for real data lookups in the quick check loops.
491 (Norm::YES_NO_COMBINES_FWD
<=norm
.type
&& norm
.type
<=Norm::NO_NO_DELTA
) ||
493 if(isDecompNo
&& start
<indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]) {
494 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=start
;
496 UBool isCompNoMaybe
= norm
.type
>=Norm::NO_NO_COMP_YES
;
497 if(isCompNoMaybe
&& start
<indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]) {
498 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=start
;
500 if(norm
.leadCC
!=0 && start
<indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]) {
501 indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]=start
;
// Installs the trie values for Jamo and Hangul syllables, which are handled
// algorithmically at runtime.
// First every code point in the HangulIterator ranges is checked: a trie value
// greater than INERT means the input supplied data for it, which is a fatal
// error (U_INVALID_FORMAT_ERROR). Then Jamo L/V/T get their special values,
// all Hangul syllables are written as LVT, and every JAMO_T_COUNT-th syllable
// starting at HANGUL_BASE is overwritten as LV. The minimum-code-point indexes
// are lowered to include JAMO_V_BASE and HANGUL_BASE.
// NOTE(review): the HangulIterator declaration and the fprintf call of the
// error path are not fully visible in this excerpt.
505 void Normalizer2DataBuilder::setHangulData() {
507 const HangulIterator::Range
*range
;
508 // Check that none of the Hangul/Jamo code points have data.
509 while((range
=hi
.nextRange())!=NULL
) {
510 for(UChar32 c
=range
->start
; c
<=range
->end
; ++c
) {
511 if(utrie2_get32(norm16Trie
, c
)>Normalizer2Impl::INERT
) {
514 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
516 exit(U_INVALID_FORMAT_ERROR
);
520 // Set data for algorithmic runtime handling.
521 IcuToolErrorCode
errorCode("gennorm2/setHangulData()");
523 // Jamo V/T are maybeYes
524 if(Hangul::JAMO_V_BASE
<indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]) {
525 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=Hangul::JAMO_V_BASE
;
527 utrie2_setRange32(norm16Trie
, Hangul::JAMO_L_BASE
, Hangul::JAMO_L_END
,
528 Normalizer2Impl::JAMO_L
, TRUE
, errorCode
);
529 utrie2_setRange32(norm16Trie
, Hangul::JAMO_V_BASE
, Hangul::JAMO_V_END
,
530 Normalizer2Impl::JAMO_VT
, TRUE
, errorCode
);
531 // JAMO_T_BASE+1: not U+11A7
532 utrie2_setRange32(norm16Trie
, Hangul::JAMO_T_BASE
+1, Hangul::JAMO_T_END
,
533 Normalizer2Impl::JAMO_VT
, TRUE
, errorCode
);
535 // Hangul LV encoded as minYesNo
536 uint32_t lv
=indexes
[Normalizer2Impl::IX_MIN_YES_NO
];
537 // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
538 uint32_t lvt
=indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]|
539 Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER
;
540 if(Hangul::HANGUL_BASE
<indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]) {
541 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=Hangul::HANGUL_BASE
;
543 // Set the first LV, then write all other Hangul syllables as LVT,
544 // then overwrite the remaining LV.
545 // The UTrie2 should be able to compact this into 7 32-item blocks
546 // because JAMO_T_COUNT is 28 and the UTrie2 granularity is 4.
547 // (7*32=8*28 smallest common multiple)
548 utrie2_set32(norm16Trie
, Hangul::HANGUL_BASE
, lv
, errorCode
);
549 utrie2_setRange32(norm16Trie
, Hangul::HANGUL_BASE
+1, Hangul::HANGUL_END
,
550 lvt
, TRUE
, errorCode
);
551 UChar32 c
=Hangul::HANGUL_BASE
;
552 while((c
+=Hangul::JAMO_T_COUNT
)<=Hangul::HANGUL_END
) {
553 utrie2_set32(norm16Trie
, c
, lv
, errorCode
);
555 errorCode
.assertSuccess();
// Accumulator filled by enumRangeMaxValue() while processData() enumerates
// the trie values under one lead surrogate.
// NOTE(review): the maxNorm16 member used by enumRangeMaxValue() and
// processData() is not visible in this excerpt.
560 struct Norm16Summary
{
562 // ANDing values yields 0 bits where any value has a 0.
563 // Used for worst-case HAS_COMP_BOUNDARY_AFTER.
564 uint32_t andedNorm16
;
// Range callback passed to utrie2_enumForLeadSurrogate() in processData().
// context points to a Norm16Summary; the range bounds are ignored.
// Each value is compared against the summary's maxNorm16 and AND-ed into
// andedNorm16.
// NOTE(review): the body of the if-statement and the return statement are not
// visible in this excerpt.
571 static UBool U_CALLCONV
572 enumRangeMaxValue(const void *context
, UChar32
/*start*/, UChar32
/*end*/, uint32_t value
) {
573 Norm16Summary
*p
=(Norm16Summary
*)context
;
574 if(value
>p
->maxNorm16
) {
577 p
->andedNorm16
&=value
;
// Turns the collected input data into the final in-memory data structures.
// Steps, in order:
//   1. open the norm16 trie with INERT as initial and error value
//   2. build composition lists, then decompose mappings repeatedly until a
//      pass makes no further change
//   3. postProcess() every Norm (indexes 1..length-1)
//   4. collect the "extra data" sections, concatenate them and record each
//      section start in indexes[]
//   5. write a norm16 value for every code point (Norm16Writer)
//   6. give each lead surrogate the "worst" value of the supplementary code
//      points it covers
//   7. replace supplementary minimum code points by their lead surrogates
//   8. freeze the trie, preflight its serialized size, and compute the
//      offsets and the total size of the binary data
//   9. print statistics and set the data version
// Failures exit the tool with the corresponding error code.
// NOTE(review): several statements (for example the call that sets the Hangul
// data, the `do {` line, and some conditions) are not visible in this excerpt.
583 void Normalizer2DataBuilder::processData() {
584 IcuToolErrorCode
errorCode("gennorm2/processData()");
585 norm16Trie
=utrie2_open(Normalizer2Impl::INERT
, Normalizer2Impl::INERT
, errorCode
);
586 errorCode
.assertSuccess();
588 // Build composition lists before recursive decomposition,
589 // so that we still have the raw, pair-wise mappings.
590 CompositionBuilder
compBuilder(norms
);
591 norms
.enumRanges(compBuilder
);
593 // Recursively decompose all mappings.
594 Decomposer
decomposer(norms
);
596 decomposer
.didDecompose
=FALSE
;
597 norms
.enumRanges(decomposer
);
598 } while(decomposer
.didDecompose
);
600 // Set the Norm::Type and other properties.
601 int32_t normsLength
=norms
.length();
602 for(int32_t i
=1; i
<normsLength
; ++i
) {
603 postProcess(norms
.getNormRefByIndex(i
));
606 // Write the properties, mappings and composition lists to
607 // appropriate parts of the "extra data" array.
608 ExtraData
extra(norms
, optimization
==OPTIMIZE_FAST
);
609 norms
.enumRanges(extra
);
611 extraData
=extra
.yesYesCompositions
;
612 indexes
[Normalizer2Impl::IX_MIN_YES_NO
]=extraData
.length()*2;
613 extraData
.append(extra
.yesNoMappingsAndCompositions
);
614 indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]=extraData
.length()*2;
615 extraData
.append(extra
.yesNoMappingsOnly
);
616 indexes
[Normalizer2Impl::IX_MIN_NO_NO
]=extraData
.length()*2;
617 extraData
.append(extra
.noNoMappingsCompYes
);
618 indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
]=extraData
.length()*2;
619 extraData
.append(extra
.noNoMappingsCompBoundaryBefore
);
620 indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC
]=extraData
.length()*2;
621 extraData
.append(extra
.noNoMappingsCompNoMaybeCC
);
622 indexes
[Normalizer2Impl::IX_MIN_NO_NO_EMPTY
]=extraData
.length()*2;
623 extraData
.append(extra
.noNoMappingsEmpty
);
624 indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]=extraData
.length()*2;
626 // Pad the maybeYesCompositions length to a multiple of 4,
627 // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
628 while(extra
.maybeYesCompositions
.length()&3) {
629 extra
.maybeYesCompositions
.append((UChar
)0);
631 extraData
.insert(0, extra
.maybeYesCompositions
);
632 indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]=
633 Normalizer2Impl::MIN_NORMAL_MAYBE_YES
-
634 extra
.maybeYesCompositions
.length()*2;
636 // Pad to even length for 4-byte alignment of following data.
637 if(extraData
.length()&1) {
638 extraData
.append((UChar
)0);
641 int32_t minNoNoDelta
=getMinNoNoDelta();
642 U_ASSERT((minNoNoDelta
&7)==0);
643 if(indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]>minNoNoDelta
) {
646 "data structure overflow, too much mapping composition data\n");
647 exit(U_BUFFER_OVERFLOW_ERROR
);
650 // writeNorm16() and setHangulData() reduce these as needed.
651 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=0x110000;
652 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=0x110000;
653 indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]=0x110000;
655 // Map each code point to its norm16 value,
656 // including the properties that fit directly,
657 // and the offset to the "extra data" if necessary.
658 Norm16Writer
norm16Writer(norms
, *this);
659 norms
.enumRanges(norm16Writer
);
663 // Look for the "worst" norm16 value of any supplementary code point
664 // corresponding to a lead surrogate, and set it as that surrogate's value.
665 // Enables UTF-16 quick check inner loops to look at only code units.
667 // We could be more sophisticated:
668 // We could collect a bit set for whether there are values in the different
669 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
670 // and select the best value that only breaks the composition and/or decomposition
671 // inner loops if necessary.
672 // However, that seems like overkill for an optimization for supplementary characters.
673 for(UChar lead
=0xd800; lead
<0xdc00; ++lead
) {
674 uint32_t surrogateCPNorm16
=utrie2_get32(norm16Trie
, lead
);
675 Norm16Summary summary
={ surrogateCPNorm16
, surrogateCPNorm16
};
676 utrie2_enumForLeadSurrogate(norm16Trie
, lead
, NULL
, enumRangeMaxValue
, &summary
);
677 uint32_t norm16
=summary
.maxNorm16
;
678 if(norm16
>=(uint32_t)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
] &&
679 norm16
>(uint32_t)indexes
[Normalizer2Impl::IX_MIN_NO_NO
]) {
680 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
681 // Otherwise it might end up at something like JAMO_VT which stays in
682 // the inner decomposition quick check loop.
683 norm16
=(uint32_t)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]-1;
686 (norm16
&~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER
)|
687 (summary
.andedNorm16
&Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER
);
688 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie
, lead
, norm16
, errorCode
);
691 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
692 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
693 // which is harmless.
694 // As a result, the minimum code points are always BMP code points.
695 int32_t minCP
=indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
];
697 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=U16_LEAD(minCP
);
699 minCP
=indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
];
701 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=U16_LEAD(minCP
);
703 minCP
=indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
];
705 indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]=U16_LEAD(minCP
);
// Freeze, then preflight the serialized size: U_BUFFER_OVERFLOW_ERROR is the expected result here.
708 utrie2_freeze(norm16Trie
, UTRIE2_16_VALUE_BITS
, errorCode
);
709 norm16TrieLength
=utrie2_serialize(norm16Trie
, NULL
, 0, errorCode
);
710 if(errorCode
.get()!=U_BUFFER_OVERFLOW_ERROR
) {
711 fprintf(stderr
, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
712 errorCode
.errorName());
713 exit(errorCode
.reset());
// Lay out the file contents: indexes, trie, extra data (2 bytes per unit), smallFCD.
717 int32_t offset
=(int32_t)sizeof(indexes
);
718 indexes
[Normalizer2Impl::IX_NORM_TRIE_OFFSET
]=offset
;
719 offset
+=norm16TrieLength
;
720 indexes
[Normalizer2Impl::IX_EXTRA_DATA_OFFSET
]=offset
;
721 offset
+=extraData
.length()*2;
722 indexes
[Normalizer2Impl::IX_SMALL_FCD_OFFSET
]=offset
;
723 offset
+=sizeof(smallFCD
);
724 int32_t totalSize
=offset
;
725 for(int32_t i
=Normalizer2Impl::IX_RESERVED3_OFFSET
; i
<=Normalizer2Impl::IX_TOTAL_SIZE
; ++i
) {
726 indexes
[i
]=totalSize
;
730 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength
);
731 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData
.length());
732 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD
));
733 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize
);
734 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]);
735 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]);
736 printf("minLcccCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]);
737 printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_YES_NO
]);
738 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]);
739 printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO
]);
740 printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
]);
741 printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC
]);
742 printf("minNoNoEmpty: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO_EMPTY
]);
743 printf("limitNoNo: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]);
744 printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta
);
745 printf("minMaybeYes: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]);
// If no Unicode version was set by the input, fall back to U_UNICODE_VERSION.
748 UVersionInfo nullVersion
={ 0, 0, 0, 0 };
749 if(0==memcmp(nullVersion
, unicodeVersion
, 4)) {
750 u_versionFromString(unicodeVersion
, U_UNICODE_VERSION
);
752 memcpy(dataInfo
.dataVersion
, unicodeVersion
, 4);
// Writes the binary .nrm data file.
// The trie is serialized into a temporary byte array, then the blocks are
// written in the order recorded in indexes[]: indexes, trie bytes, extra
// data, smallFCD. The copyright string is included when haveCopyright is set.
// Failure to create or finish the file exits with the ICU error code; a
// mismatch between the written size and indexes[IX_TOTAL_SIZE] exits with
// U_INTERNAL_PROGRAM_ERROR.
// NOTE(review): the statements at original lines 756-757 are not visible in this excerpt.
755 void Normalizer2DataBuilder::writeBinaryFile(const char *filename
) {
758 IcuToolErrorCode
errorCode("gennorm2/writeBinaryFile()");
759 LocalArray
<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength
]);
760 utrie2_serialize(norm16Trie
, norm16TrieBytes
.getAlias(), norm16TrieLength
, errorCode
);
761 errorCode
.assertSuccess();
763 UNewDataMemory
*pData
=
764 udata_create(NULL
, NULL
, filename
, &dataInfo
,
765 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, errorCode
);
766 if(errorCode
.isFailure()) {
767 fprintf(stderr
, "gennorm2 error: unable to create the output file %s - %s\n",
768 filename
, errorCode
.errorName());
769 exit(errorCode
.reset());
771 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
772 udata_writeBlock(pData
, norm16TrieBytes
.getAlias(), norm16TrieLength
);
773 udata_writeUString(pData
, toUCharPtr(extraData
.getBuffer()), extraData
.length());
774 udata_writeBlock(pData
, smallFCD
, sizeof(smallFCD
));
775 int32_t writtenSize
=udata_finish(pData
, errorCode
);
776 if(errorCode
.isFailure()) {
777 fprintf(stderr
, "gennorm2: error %s writing the output file\n", errorCode
.errorName());
778 exit(errorCode
.reset());
// Sanity check: the bytes written must match the size computed in processData().
780 int32_t totalSize
=indexes
[Normalizer2Impl::IX_TOTAL_SIZE
];
781 if(writtenSize
!=totalSize
) {
782 fprintf(stderr
, "gennorm2 error: written size %ld != calculated size %ld\n",
783 (long)writtenSize
, (long)totalSize
);
784 exit(U_INTERNAL_PROGRAM_ERROR
);
// Writes the data as C/C++ source text instead of a binary file.
// filename is split into a directory path and a base name; the base name
// without its extension becomes the prefix of all generated identifiers
// (<name>_formatVersion, <name>_dataVersion, <name>_indexes, <name>_trieIndex,
// <name>_extraData, <name>_smallFCD, <name>_trie). The output is wrapped in
// #ifdef INCLUDED_FROM_NORMALIZER2_CPP. Failure to create the file exits with
// U_FILE_ACCESS_ERROR.
// NOTE(review): the declarations of the line/line2 buffers, the file-open
// check, several usrc_write* calls and the file close are not visible in
// this excerpt.
789 Normalizer2DataBuilder::writeCSourceFile(const char *filename
) {
792 IcuToolErrorCode
errorCode("gennorm2/writeCSourceFile()");
793 const char *basename
=findBasename(filename
);
794 CharString
path(filename
, (int32_t)(basename
-filename
), errorCode
);
795 CharString
dataName(basename
, errorCode
);
796 const char *extension
=strrchr(basename
, '.');
797 if(extension
!=NULL
) {
798 dataName
.truncate((int32_t)(extension
-basename
));
800 errorCode
.assertSuccess();
802 LocalArray
<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength
]);
803 utrie2_serialize(norm16Trie
, norm16TrieBytes
.getAlias(), norm16TrieLength
, errorCode
);
804 errorCode
.assertSuccess();
806 FILE *f
=usrc_create(path
.data(), basename
, "icu/source/tools/gennorm2/n2builder.cpp");
808 fprintf(stderr
, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
810 exit(U_FILE_ACCESS_ERROR
);
813 fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f
);
815 sprintf(line
, "static const UVersionInfo %s_formatVersion={", dataName
.data());
816 usrc_writeArray(f
, line
, dataInfo
.formatVersion
, 8, 4, "};\n");
817 sprintf(line
, "static const UVersionInfo %s_dataVersion={", dataName
.data());
818 usrc_writeArray(f
, line
, dataInfo
.dataVersion
, 8, 4, "};\n\n");
819 sprintf(line
, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n",
823 indexes
, 32, Normalizer2Impl::IX_COUNT
,
825 sprintf(line
, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName
.data());
826 usrc_writeUTrie2Arrays(f
,
830 sprintf(line
, "static const uint16_t %s_extraData[%%ld]={\n", dataName
.data());
833 extraData
.getBuffer(), 16, extraData
.length(),
835 sprintf(line
, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName
.data());
838 smallFCD
, 8, sizeof(smallFCD
),
840 sprintf(line
, "static const UTrie2 %s_trie={\n", dataName
.data());
842 sprintf(line2
, "%s_trieIndex", dataName
.data());
843 usrc_writeUTrie2Struct(f
,
845 norm16Trie
, line2
, NULL
,
847 fputs("\n#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f
);
// Compares two possibly-null string pointers; used by writeDataFile() and
// computeDiff() to compare mappings.
// From the visible branches: one branch returns whether s2 is null, and a
// separate branch handles s2 being null.
// NOTE(review): the first condition and the remaining branches are not
// visible in this excerpt.
853 bool equalStrings(const UnicodeString
*s1
, const UnicodeString
*s2
) {
855 return s2
== nullptr;
856 } else if(s2
== nullptr) {
// Separator characters for the text data file, indexed by Norm::MappingType
// in writeDataFile() (typeChars[prevType]).
// NOTE(review): presumably in enum order NONE, REMOVED, ROUND_TRIP, ONE_WAY —
// confirm against the Norm::MappingType declaration.
863 const char *typeChars
= "?-=>";
// Writes the code points of mapping string m to f as hexadecimal numbers
// (at least four digits each, "%04lX"), separated by single spaces.
// Nothing is written for the code points when m is null or empty.
// NOTE(review): the declaration of i, the re-read of c inside the loop and any
// trailing output are not visible in this excerpt.
865 void writeMapping(FILE *f
, const UnicodeString
*m
) {
866 if(m
!= nullptr && !m
->isEmpty()) {
868 UChar32 c
= m
->char32At(i
);
869 fprintf(f
, "%04lX", (long)c
);
870 while((i
+= U16_LENGTH(c
)) < m
->length()) {
872 fprintf(f
, " %04lX", (long)c
);
// Writes the collected data back out in the text input syntax.
// Output, in order: an optional "* Unicode <version>" line when a version is
// set; the combining classes from norms.ccSet as "XXXX:cc" or
// "XXXX..YYYY:cc" lines; then the mappings from norms.mappingSet as single
// code points or ranges followed by the type character from typeChars and
// the mapping. Adjacent code points with equal data are merged into ranges.
// writeRemoved selects whether entries of type Norm::REMOVED are written.
// Failure to open the file exits with U_FILE_ACCESS_ERROR.
// NOTE(review): the loop headers, several declarations and the end-of-data
// handling are not visible in this excerpt.
881 Normalizer2DataBuilder::writeDataFile(const char *filename
, bool writeRemoved
) const {
882 // Do not processData() before writing the input-syntax data file.
883 FILE *f
= fopen(filename
, "w");
885 fprintf(stderr
, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
887 exit(U_FILE_ACCESS_ERROR
);
891 if(unicodeVersion
[0] != 0 || unicodeVersion
[1] != 0 ||
892 unicodeVersion
[2] != 0 || unicodeVersion
[3] != 0) {
893 char uv
[U_MAX_VERSION_STRING_LENGTH
];
894 u_versionToString(unicodeVersion
, uv
);
895 fprintf(f
, "* Unicode %s\n\n", uv
);
898 UnicodeSetIterator
ccIter(norms
.ccSet
);
899 UChar32 start
= U_SENTINEL
;
900 UChar32 end
= U_SENTINEL
;
903 bool didWrite
= false;
907 if(ccIter
.next() && !ccIter
.isString()) {
908 c
= ccIter
.getCodepoint();
915 if(cc
== prevCC
&& c
== (end
+ 1)) {
920 fprintf(f
, "%04lX:%d\n", (long)start
, (int)prevCC
);
922 fprintf(f
, "%04lX..%04lX:%d\n", (long)start
, (long)end
, (int)prevCC
);
934 UnicodeSetIterator
mIter(norms
.mappingSet
);
937 const UnicodeString
*prevMapping
= nullptr;
938 Norm::MappingType prevType
= Norm::NONE
;
943 if(mIter
.next() && !mIter
.isString()) {
944 c
= mIter
.getCodepoint();
945 norm
= norms
.getNorm(c
);
951 const UnicodeString
*mapping
;
952 Norm::MappingType type
;
953 if(norm
== nullptr) {
957 type
= norm
->mappingType
;
958 if(type
== Norm::NONE
) {
961 mapping
= norm
->mapping
;
964 if(type
== prevType
&& equalStrings(mapping
, prevMapping
) && c
== (end
+ 1)) {
967 if(writeRemoved
? prevType
!= Norm::NONE
: prevType
> Norm::REMOVED
) {
969 fprintf(f
, "%04lX%c", (long)start
, typeChars
[prevType
]);
971 fprintf(f
, "%04lX..%04lX%c", (long)start
, (long)end
, typeChars
[prevType
]);
973 writeMapping(f
, prevMapping
);
976 prevMapping
= mapping
;
// Computes diff = b1 - b2, so that applying diff on top of b2 should yield b1.
//   - The Unicode version of b1 is copied into diff when the two versions differ.
//   - Combining classes are compared for every code point in the union of
//     both ccSets.
//   - Mappings are compared for every code point in the union of both
//     mappingSets. Where type or mapping string differ, diff receives b1's
//     state: removeMapping() when b1 has no mapping, otherwise
//     setRoundTripMapping() or setOneWayMapping() with b1's mapping.
// NOTE(review): the action taken for differing combining classes and the
// branches that handle a missing mapping are not visible in this excerpt.
985 Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder
&b1
,
986 const Normalizer2DataBuilder
&b2
,
987 Normalizer2DataBuilder
&diff
) {
988 // Compute diff = b1 - b2
989 // so that we should be able to get b1 = b2 + diff.
990 if(0 != memcmp(b1
.unicodeVersion
, b2
.unicodeVersion
, U_MAX_VERSION_LENGTH
)) {
991 memcpy(diff
.unicodeVersion
, b1
.unicodeVersion
, U_MAX_VERSION_LENGTH
);
994 UnicodeSet
ccSet(b1
.norms
.ccSet
);
995 ccSet
.addAll(b2
.norms
.ccSet
);
996 UnicodeSetIterator
ccIter(ccSet
);
997 while(ccIter
.next() && !ccIter
.isString()) {
998 UChar32 c
= ccIter
.getCodepoint();
999 uint8_t cc1
= b1
.norms
.getCC(c
);
1000 uint8_t cc2
= b2
.norms
.getCC(c
);
1006 UnicodeSet
mSet(b1
.norms
.mappingSet
);
1007 mSet
.addAll(b2
.norms
.mappingSet
);
1008 UnicodeSetIterator
mIter(mSet
);
1009 while(mIter
.next() && !mIter
.isString()) {
1010 UChar32 c
= mIter
.getCodepoint();
1011 const Norm
*norm1
= b1
.norms
.getNorm(c
);
1012 const Norm
*norm2
= b2
.norms
.getNorm(c
);
1013 const UnicodeString
*mapping1
;
1014 Norm::MappingType type1
;
1015 if(norm1
== nullptr || !norm1
->hasMapping()) {
1019 mapping1
= norm1
->mapping
;
1020 type1
= norm1
->mappingType
;
1022 const UnicodeString
*mapping2
;
1023 Norm::MappingType type2
;
1024 if(norm2
== nullptr || !norm2
->hasMapping()) {
1028 mapping2
= norm2
->mapping
;
1029 type2
= norm2
->mappingType
;
1031 if(type1
== type2
&& equalStrings(mapping1
, mapping2
)) {
1033 } else if(type1
== Norm::NONE
) {
1034 diff
.removeMapping(c
);
1035 } else if(type1
== Norm::ROUND_TRIP
) {
1036 diff
.setRoundTripMapping(c
, *mapping1
);
1037 } else if(type1
== Norm::ONE_WAY
) {
1038 diff
.setOneWayMapping(c
, *mapping1
);
1045 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1048 * Hey, Emacs, please set the following:
1051 * indent-tabs-mode: nil