1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: n2builder.cpp
12 * tab size: 8 (not used)
15 * created on: 2009nov25
16 * created by: Markus W. Scherer
18 * Builds Normalizer2 data and writes a binary .nrm file.
19 * For the file format see source/common/normalizer2impl.h.
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
29 #include "unicode/errorcode.h"
30 #include "unicode/localpointer.h"
31 #include "unicode/putil.h"
32 #include "unicode/ucptrie.h"
33 #include "unicode/udata.h"
34 #include "unicode/umutablecptrie.h"
35 #include "unicode/uniset.h"
36 #include "unicode/unistr.h"
37 #include "unicode/usetiter.h"
38 #include "unicode/ustring.h"
40 #include "extradata.h"
42 #include "normalizer2impl.h"
49 #if !UCONFIG_NO_NORMALIZATION
51 /* UDataInfo cf. udata.h */
/*
 * Data header descriptor for the generated normalization data.
 * processData() copies the builder's Unicode version into dataVersion, and
 * writeBinaryFile() passes the address of this struct to udata_create().
 */
52 static UDataInfo dataInfo
={
61 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
62 { 4, 0, 0, 0 }, /* formatVersion */
63 { 11, 0, 0, 0 } /* dataVersion (Unicode version) */
68 class HangulIterator
{
74 HangulIterator() : rangeIndex(0) {}
75 const Range
*nextRange() {
76 if(rangeIndex
<UPRV_LENGTHOF(ranges
)) {
77 return ranges
+rangeIndex
++;
83 static const Range ranges
[4];
87 const HangulIterator::Range
HangulIterator::ranges
[4]={
88 { Hangul::JAMO_L_BASE
, Hangul::JAMO_L_END
},
89 { Hangul::JAMO_V_BASE
, Hangul::JAMO_V_END
},
90 // JAMO_T_BASE+1: not U+11A7
91 { Hangul::JAMO_T_BASE
+1, Hangul::JAMO_T_END
},
92 { Hangul::HANGUL_BASE
, Hangul::HANGUL_END
},
95 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode
&errorCode
) :
97 phase(0), overrideHandling(OVERRIDE_PREVIOUS
), optimization(OPTIMIZE_NORMAL
),
98 norm16TrieBytes(nullptr), norm16TrieLength(0) {
99 memset(unicodeVersion
, 0, sizeof(unicodeVersion
));
100 memset(indexes
, 0, sizeof(indexes
));
101 memset(smallFCD
, 0, sizeof(smallFCD
));
// Releases the serialized trie bytes, which processData() allocates with new[].
104 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
105 delete[] norm16TrieBytes
;
109 Normalizer2DataBuilder::setUnicodeVersion(const char *v
) {
110 UVersionInfo nullVersion
={ 0, 0, 0, 0 };
111 UVersionInfo version
;
112 u_versionFromString(version
, v
);
113 if( 0!=memcmp(version
, unicodeVersion
, U_MAX_VERSION_LENGTH
) &&
114 0!=memcmp(nullVersion
, unicodeVersion
, U_MAX_VERSION_LENGTH
)
116 char buffer
[U_MAX_VERSION_STRING_LENGTH
];
117 u_versionToString(unicodeVersion
, buffer
);
118 fprintf(stderr
, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
120 exit(U_ILLEGAL_ARGUMENT_ERROR
);
122 memcpy(unicodeVersion
, version
, U_MAX_VERSION_LENGTH
);
125 Norm
*Normalizer2DataBuilder::checkNormForMapping(Norm
*p
, UChar32 c
) {
127 if(p
->mappingType
!=Norm::NONE
) {
128 if( overrideHandling
==OVERRIDE_NONE
||
129 (overrideHandling
==OVERRIDE_PREVIOUS
&& p
->mappingPhase
==phase
)
132 "error in gennorm2 phase %d: "
133 "not permitted to override mapping for U+%04lX from phase %d\n",
134 (int)phase
, (long)c
, (int)p
->mappingPhase
);
135 exit(U_INVALID_FORMAT_ERROR
);
140 p
->mappingPhase
=phase
;
// Selects the override policy that checkNormForMapping() consults.
// NOTE(review): the function body is not visible in this view; presumably it
// stores oh in overrideHandling and may advance the phase -- confirm.
145 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh
) {
150 void Normalizer2DataBuilder::setCC(UChar32 c
, uint8_t cc
) {
151 norms
.createNorm(c
)->cc
=cc
;
155 static UBool
isWellFormed(const UnicodeString
&s
) {
156 UErrorCode errorCode
=U_ZERO_ERROR
;
157 u_strToUTF8(NULL
, 0, NULL
, toUCharPtr(s
.getBuffer()), s
.length(), &errorCode
);
158 return U_SUCCESS(errorCode
) || errorCode
==U_BUFFER_OVERFLOW_ERROR
;
161 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c
, const UnicodeString
&m
) {
162 if(!isWellFormed(m
)) {
164 "error in gennorm2 phase %d: "
165 "illegal one-way mapping from U+%04lX to malformed string\n",
166 (int)phase
, (long)c
);
167 exit(U_INVALID_FORMAT_ERROR
);
169 Norm
*p
=checkNormForMapping(norms
.createNorm(c
), c
);
170 p
->mapping
=new UnicodeString(m
);
171 p
->mappingType
=Norm::ONE_WAY
;
173 norms
.mappingSet
.add(c
);
176 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c
, const UnicodeString
&m
) {
177 if(U_IS_SURROGATE(c
)) {
179 "error in gennorm2 phase %d: "
180 "illegal round-trip mapping from surrogate code point U+%04lX\n",
181 (int)phase
, (long)c
);
182 exit(U_INVALID_FORMAT_ERROR
);
184 if(!isWellFormed(m
)) {
186 "error in gennorm2 phase %d: "
187 "illegal round-trip mapping from U+%04lX to malformed string\n",
188 (int)phase
, (long)c
);
189 exit(U_INVALID_FORMAT_ERROR
);
191 int32_t numCP
=u_countChar32(toUCharPtr(m
.getBuffer()), m
.length());
194 "error in gennorm2 phase %d: "
195 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
196 (int)phase
, (long)c
, (int)numCP
);
197 exit(U_INVALID_FORMAT_ERROR
);
199 Norm
*p
=checkNormForMapping(norms
.createNorm(c
), c
);
200 p
->mapping
=new UnicodeString(m
);
201 p
->mappingType
=Norm::ROUND_TRIP
;
202 p
->mappingCP
=U_SENTINEL
;
203 norms
.mappingSet
.add(c
);
206 void Normalizer2DataBuilder::removeMapping(UChar32 c
) {
207 // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
208 Norm
*p
=checkNormForMapping(norms
.createNorm(c
), c
);
209 p
->mappingType
=Norm::REMOVED
;
210 norms
.mappingSet
.add(c
);
213 UBool
Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer
&buffer
,
214 Norm::MappingType mappingType
) const {
215 if(buffer
.isEmpty()) {
216 return FALSE
; // Maps-to-empty-string is no boundary of any kind.
218 int32_t lastStarterIndex
=buffer
.lastStarterIndex();
219 if(lastStarterIndex
<0) {
220 return FALSE
; // no starter
222 const int32_t lastIndex
=buffer
.length()-1;
223 if(mappingType
==Norm::ONE_WAY
&& lastStarterIndex
<lastIndex
&& buffer
.ccAt(lastIndex
)>1) {
224 // One-way mapping where after the last starter is at least one combining mark
225 // with a combining class greater than 1,
226 // which means that another combining mark can reorder before it.
227 // By contrast, in a round-trip mapping this does not prevent a boundary as long as
228 // the starter or composite does not combine-forward with a following combining mark.
231 UChar32 starter
=buffer
.charAt(lastStarterIndex
);
232 if(lastStarterIndex
==0 && norms
.combinesBack(starter
)) {
233 // The last starter is at the beginning of the mapping and combines backward.
236 if(Hangul::isJamoL(starter
) ||
237 (Hangul::isJamoV(starter
) &&
238 0<lastStarterIndex
&& Hangul::isJamoL(buffer
.charAt(lastStarterIndex
-1)))) {
239 // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
240 // otherwise it is blocked.
241 return lastStarterIndex
!=lastIndex
;
243 // Note: There can be no Hangul syllable in the fully decomposed mapping.
245 // Multiple starters can combine into one.
246 // Look for the first of the last sequence of starters, excluding Jamos.
247 int32_t i
=lastStarterIndex
;
249 while(0<i
&& buffer
.ccAt(i
-1)==0 && !Hangul::isJamo(c
=buffer
.charAt(i
-1))) {
253 // Compose as far as possible, and see if further compositions with
254 // characters following this mapping are possible.
255 const Norm
*starterNorm
=norms
.getNorm(starter
);
256 if(i
==lastStarterIndex
&&
257 (starterNorm
==nullptr || starterNorm
->compositions
==nullptr)) {
258 return TRUE
; // The last starter does not combine forward.
261 while(++i
<buffer
.length()) {
262 uint8_t cc
=buffer
.ccAt(i
); // !=0 if after last starter
263 if(i
>lastStarterIndex
&& norms
.combinesWithCCBetween(*starterNorm
, prevCC
, cc
)) {
264 // The starter combines with a mark that reorders before the current one.
267 UChar32 c
=buffer
.charAt(i
);
268 if(starterNorm
!=nullptr && (prevCC
<cc
|| prevCC
==0) &&
269 norms
.getNormRef(c
).combinesBack
&& (starter
=starterNorm
->combine(c
))>=0) {
270 // The starter combines with c into a composite replacement starter.
271 starterNorm
=norms
.getNorm(starter
);
272 if(i
>=lastStarterIndex
&&
273 (starterNorm
==nullptr || starterNorm
->compositions
==nullptr)) {
274 return TRUE
; // The composite does not combine further.
276 // Keep prevCC because we "removed" the combining mark.
278 starterNorm
=norms
.getNorm(c
);
279 if(i
==lastStarterIndex
&&
280 (starterNorm
==nullptr || starterNorm
->compositions
==nullptr)) {
281 return TRUE
; // The new starter does not combine forward.
289 return FALSE
; // forward-combining starter at the very end
291 if(norms
.combinesWithCCBetween(*starterNorm
, prevCC
, 256)) {
292 // The starter combines with another mark.
298 UBool
Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer
&buffer
) const {
299 if(buffer
.lastStarterIndex()<0) {
300 return FALSE
; // no starter
302 const Norm
*starterNorm
=nullptr;
304 for(int32_t i
=0; i
<buffer
.length(); ++i
) {
305 UChar32 c
=buffer
.charAt(i
);
306 uint8_t cc
=buffer
.ccAt(i
);
307 if(starterNorm
!=nullptr && (prevCC
<cc
|| prevCC
==0) &&
308 norms
.getNormRef(c
).combinesBack
&& starterNorm
->combine(c
)>=0) {
309 return TRUE
; // normal composite
311 if(Hangul::isJamoL(c
)) {
312 if((i
+1)<buffer
.length() && Hangul::isJamoV(buffer
.charAt(i
+1))) {
313 return TRUE
; // Hangul syllable
317 starterNorm
=norms
.getNorm(c
);
325 void Normalizer2DataBuilder::postProcess(Norm
&norm
) {
326 // Prerequisites: Compositions are built, mappings are recursively decomposed.
327 // Mappings are not yet in canonical order.
329 // This function works on a Norm struct. We do not know which code point(s) map(s) to it.
330 // Therefore, we cannot compute algorithmic mapping deltas here.
331 // Error conditions are checked, but printed later when we do know the offending code point.
332 if(norm
.hasMapping()) {
333 if(norm
.mapping
->length()>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
334 norm
.error
="mapping longer than maximum of 31";
337 // Ensure canonical order.
338 BuilderReorderingBuffer buffer
;
339 if(norm
.rawMapping
!=nullptr) {
340 norms
.reorder(*norm
.rawMapping
, buffer
);
343 norms
.reorder(*norm
.mapping
, buffer
);
344 if(buffer
.isEmpty()) {
345 // A character that is deleted (maps to an empty string) must
346 // get the worst-case lccc and tccc values because arbitrary
347 // characters on both sides will become adjacent.
351 norm
.leadCC
=buffer
.ccAt(0);
352 norm
.trailCC
=buffer
.ccAt(buffer
.length()-1);
355 norm
.hasCompBoundaryBefore
=
356 !buffer
.isEmpty() && norm
.leadCC
==0 && !norms
.combinesBack(buffer
.charAt(0));
357 norm
.hasCompBoundaryAfter
=
358 norm
.compositions
==nullptr && mappingHasCompBoundaryAfter(buffer
, norm
.mappingType
);
360 if(norm
.combinesBack
) {
361 norm
.error
="combines-back and decomposes, not possible in Unicode normalization";
362 } else if(norm
.mappingType
==Norm::ROUND_TRIP
) {
363 if(norm
.compositions
!=NULL
) {
364 norm
.type
=Norm::YES_NO_COMBINES_FWD
;
366 norm
.type
=Norm::YES_NO_MAPPING_ONLY
;
368 } else { // one-way mapping
369 if(norm
.compositions
!=NULL
) {
370 norm
.error
="combines-forward and has a one-way mapping, "
371 "not possible in Unicode normalization";
372 } else if(buffer
.isEmpty()) {
373 norm
.type
=Norm::NO_NO_EMPTY
;
374 } else if(!norm
.hasCompBoundaryBefore
) {
375 norm
.type
=Norm::NO_NO_COMP_NO_MAYBE_CC
;
376 } else if(mappingRecomposes(buffer
)) {
377 norm
.type
=Norm::NO_NO_COMP_BOUNDARY_BEFORE
;
379 // The mapping is comp-normalized.
380 norm
.type
=Norm::NO_NO_COMP_YES
;
383 } else { // no mapping
384 norm
.leadCC
=norm
.trailCC
=norm
.cc
;
386 norm
.hasCompBoundaryBefore
=
387 norm
.cc
==0 && !norm
.combinesBack
;
388 norm
.hasCompBoundaryAfter
=
389 norm
.cc
==0 && !norm
.combinesBack
&& norm
.compositions
==nullptr;
391 if(norm
.combinesBack
) {
392 if(norm
.compositions
!=nullptr) {
393 // Earlier code checked ccc=0.
394 norm
.type
=Norm::MAYBE_YES_COMBINES_FWD
;
396 norm
.type
=Norm::MAYBE_YES_SIMPLE
; // any ccc
398 } else if(norm
.compositions
!=nullptr) {
399 // Earlier code checked ccc=0.
400 norm
.type
=Norm::YES_YES_COMBINES_FWD
;
401 } else if(norm
.cc
!=0) {
402 norm
.type
=Norm::YES_YES_WITH_CC
;
404 norm
.type
=Norm::INERT
;
409 class Norm16Writer
: public Norms::Enumerator
{
411 Norm16Writer(UMutableCPTrie
*trie
, Norms
&n
, Normalizer2DataBuilder
&b
) :
412 Norms::Enumerator(n
), builder(b
), norm16Trie(trie
) {}
413 void rangeHandler(UChar32 start
, UChar32 end
, Norm
&norm
) U_OVERRIDE
{
414 builder
.writeNorm16(norm16Trie
, start
, end
, norm
);
416 Normalizer2DataBuilder
&builder
;
417 UMutableCPTrie
*norm16Trie
;
420 void Normalizer2DataBuilder::setSmallFCD(UChar32 c
) {
421 UChar32 lead
= c
<=0xffff ? c
: U16_LEAD(c
);
422 smallFCD
[lead
>>8]|=(uint8_t)1<<((lead
>>5)&7);
425 void Normalizer2DataBuilder::writeNorm16(UMutableCPTrie
*norm16Trie
, UChar32 start
, UChar32 end
, Norm
&norm
) {
426 if((norm
.leadCC
|norm
.trailCC
)!=0) {
427 for(UChar32 c
=start
; c
<=end
; ++c
) {
435 norm16
=Normalizer2Impl::INERT
;
437 case Norm::YES_YES_COMBINES_FWD
:
438 norm16
=norm
.offset
*2;
440 case Norm::YES_NO_COMBINES_FWD
:
441 norm16
=indexes
[Normalizer2Impl::IX_MIN_YES_NO
]+norm
.offset
*2;
443 case Norm::YES_NO_MAPPING_ONLY
:
444 norm16
=indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]+norm
.offset
*2;
446 case Norm::NO_NO_COMP_YES
:
447 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO
]+norm
.offset
*2;
449 case Norm::NO_NO_COMP_BOUNDARY_BEFORE
:
450 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
]+norm
.offset
*2;
452 case Norm::NO_NO_COMP_NO_MAYBE_CC
:
453 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC
]+norm
.offset
*2;
455 case Norm::NO_NO_EMPTY
:
456 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO_EMPTY
]+norm
.offset
*2;
458 case Norm::NO_NO_DELTA
:
460 // Positive offset from minNoNoDelta, shifted left for additional bits.
461 int32_t offset
=(norm
.offset
+Normalizer2Impl::MAX_DELTA
)<<Normalizer2Impl::DELTA_SHIFT
;
462 if(norm
.trailCC
==0) {
464 } else if(norm
.trailCC
==1) {
465 offset
|=Normalizer2Impl::DELTA_TCCC_1
;
467 offset
|=Normalizer2Impl::DELTA_TCCC_GT_1
;
469 norm16
=getMinNoNoDelta()+offset
;
472 case Norm::MAYBE_YES_COMBINES_FWD
:
473 norm16
=indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]+norm
.offset
*2;
475 case Norm::MAYBE_YES_SIMPLE
:
476 norm16
=Normalizer2Impl::MIN_NORMAL_MAYBE_YES
+norm
.cc
*2; // ccc=0..255
478 case Norm::YES_YES_WITH_CC
:
479 U_ASSERT(norm
.cc
!=0);
480 norm16
=Normalizer2Impl::MIN_YES_YES_WITH_CC
-2+norm
.cc
*2; // ccc=1..255
482 default: // Should not occur.
483 exit(U_INTERNAL_PROGRAM_ERROR
);
485 U_ASSERT((norm16
&1)==0);
486 if(norm
.hasCompBoundaryAfter
) {
487 norm16
|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER
;
489 IcuToolErrorCode
errorCode("gennorm2/writeNorm16()");
490 umutablecptrie_setRange(norm16Trie
, start
, end
, (uint32_t)norm16
, errorCode
);
492 // Set the minimum code points for real data lookups in the quick check loops.
494 (Norm::YES_NO_COMBINES_FWD
<=norm
.type
&& norm
.type
<=Norm::NO_NO_DELTA
) ||
496 if(isDecompNo
&& start
<indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]) {
497 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=start
;
499 UBool isCompNoMaybe
= norm
.type
>=Norm::NO_NO_COMP_YES
;
500 if(isCompNoMaybe
&& start
<indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]) {
501 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=start
;
503 if(norm
.leadCC
!=0 && start
<indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]) {
504 indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]=start
;
508 void Normalizer2DataBuilder::setHangulData(UMutableCPTrie
*norm16Trie
) {
510 const HangulIterator::Range
*range
;
511 // Check that none of the Hangul/Jamo code points have data.
512 while((range
=hi
.nextRange())!=NULL
) {
513 for(UChar32 c
=range
->start
; c
<=range
->end
; ++c
) {
514 if(umutablecptrie_get(norm16Trie
, c
)>Normalizer2Impl::INERT
) {
517 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
519 exit(U_INVALID_FORMAT_ERROR
);
523 // Set data for algorithmic runtime handling.
524 IcuToolErrorCode
errorCode("gennorm2/setHangulData()");
526 // Jamo V/T are maybeYes
527 if(Hangul::JAMO_V_BASE
<indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]) {
528 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=Hangul::JAMO_V_BASE
;
530 umutablecptrie_setRange(norm16Trie
, Hangul::JAMO_L_BASE
, Hangul::JAMO_L_END
,
531 Normalizer2Impl::JAMO_L
, errorCode
);
532 umutablecptrie_setRange(norm16Trie
, Hangul::JAMO_V_BASE
, Hangul::JAMO_V_END
,
533 Normalizer2Impl::JAMO_VT
, errorCode
);
534 // JAMO_T_BASE+1: not U+11A7
535 umutablecptrie_setRange(norm16Trie
, Hangul::JAMO_T_BASE
+1, Hangul::JAMO_T_END
,
536 Normalizer2Impl::JAMO_VT
, errorCode
);
538 // Hangul LV encoded as minYesNo
539 uint32_t lv
=indexes
[Normalizer2Impl::IX_MIN_YES_NO
];
540 // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
541 uint32_t lvt
=indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]|
542 Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER
;
543 if(Hangul::HANGUL_BASE
<indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]) {
544 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=Hangul::HANGUL_BASE
;
546 // Set the first LV, then write all other Hangul syllables as LVT,
547 // then overwrite the remaining LV.
548 umutablecptrie_set(norm16Trie
, Hangul::HANGUL_BASE
, lv
, errorCode
);
549 umutablecptrie_setRange(norm16Trie
, Hangul::HANGUL_BASE
+1, Hangul::HANGUL_END
, lvt
, errorCode
);
550 UChar32 c
=Hangul::HANGUL_BASE
;
551 while((c
+=Hangul::JAMO_T_COUNT
)<=Hangul::HANGUL_END
) {
552 umutablecptrie_set(norm16Trie
, c
, lv
, errorCode
);
554 errorCode
.assertSuccess();
557 LocalUCPTriePointer
Normalizer2DataBuilder::processData() {
558 // Build composition lists before recursive decomposition,
559 // so that we still have the raw, pair-wise mappings.
560 CompositionBuilder
compBuilder(norms
);
561 norms
.enumRanges(compBuilder
);
563 // Recursively decompose all mappings.
564 Decomposer
decomposer(norms
);
566 decomposer
.didDecompose
=FALSE
;
567 norms
.enumRanges(decomposer
);
568 } while(decomposer
.didDecompose
);
570 // Set the Norm::Type and other properties.
571 int32_t normsLength
=norms
.length();
572 for(int32_t i
=1; i
<normsLength
; ++i
) {
573 postProcess(norms
.getNormRefByIndex(i
));
576 // Write the properties, mappings and composition lists to
577 // appropriate parts of the "extra data" array.
578 ExtraData
extra(norms
, optimization
==OPTIMIZE_FAST
);
579 norms
.enumRanges(extra
);
581 extraData
=extra
.yesYesCompositions
;
582 indexes
[Normalizer2Impl::IX_MIN_YES_NO
]=extraData
.length()*2;
583 extraData
.append(extra
.yesNoMappingsAndCompositions
);
584 indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]=extraData
.length()*2;
585 extraData
.append(extra
.yesNoMappingsOnly
);
586 indexes
[Normalizer2Impl::IX_MIN_NO_NO
]=extraData
.length()*2;
587 extraData
.append(extra
.noNoMappingsCompYes
);
588 indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
]=extraData
.length()*2;
589 extraData
.append(extra
.noNoMappingsCompBoundaryBefore
);
590 indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC
]=extraData
.length()*2;
591 extraData
.append(extra
.noNoMappingsCompNoMaybeCC
);
592 indexes
[Normalizer2Impl::IX_MIN_NO_NO_EMPTY
]=extraData
.length()*2;
593 extraData
.append(extra
.noNoMappingsEmpty
);
594 indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]=extraData
.length()*2;
596 // Pad the maybeYesCompositions length to a multiple of 4,
597 // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
598 while(extra
.maybeYesCompositions
.length()&3) {
599 extra
.maybeYesCompositions
.append((UChar
)0);
601 extraData
.insert(0, extra
.maybeYesCompositions
);
602 indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]=
603 Normalizer2Impl::MIN_NORMAL_MAYBE_YES
-
604 extra
.maybeYesCompositions
.length()*2;
606 // Pad to even length for 4-byte alignment of following data.
607 if(extraData
.length()&1) {
608 extraData
.append((UChar
)0);
611 int32_t minNoNoDelta
=getMinNoNoDelta();
612 U_ASSERT((minNoNoDelta
&7)==0);
613 if(indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]>minNoNoDelta
) {
616 "data structure overflow, too much mapping composition data\n");
617 exit(U_BUFFER_OVERFLOW_ERROR
);
620 // writeNorm16() and setHangulData() reduce these as needed.
621 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=0x110000;
622 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=0x110000;
623 indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]=0x110000;
625 IcuToolErrorCode
errorCode("gennorm2/processData()");
626 UMutableCPTrie
*norm16Trie
= umutablecptrie_open(
627 Normalizer2Impl::INERT
, Normalizer2Impl::INERT
, errorCode
);
628 errorCode
.assertSuccess();
630 // Map each code point to its norm16 value,
631 // including the properties that fit directly,
632 // and the offset to the "extra data" if necessary.
633 Norm16Writer
norm16Writer(norm16Trie
, norms
, *this);
634 norms
.enumRanges(norm16Writer
);
635 // TODO: iterate via getRange() instead of callback?
637 setHangulData(norm16Trie
);
639 // Look for the "worst" norm16 value of any supplementary code point
640 // corresponding to a lead surrogate, and set it as that surrogate's value.
641 // Enables UTF-16 quick check inner loops to look at only code units.
643 // We could be more sophisticated:
644 // We could collect a bit set for whether there are values in the different
645 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
646 // and select the best value that only breaks the composition and/or decomposition
647 // inner loops if necessary.
648 // However, that seems like overkill for an optimization for supplementary characters.
650 // First check that surrogate code *points* are inert.
651 // The parser should have rejected values/mappings for them.
653 UChar32 end
= umutablecptrie_getRange(norm16Trie
, 0xd800, UCPMAP_RANGE_NORMAL
, 0,
654 nullptr, nullptr, &value
);
655 if (value
!= Normalizer2Impl::INERT
|| end
< 0xdfff) {
657 "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n",
658 (int)end
, (long)value
);
659 exit(U_INTERNAL_PROGRAM_ERROR
);
661 uint32_t maxNorm16
= 0;
662 // ANDing values yields 0 bits where any value has a 0.
663 // Used for worst-case HAS_COMP_BOUNDARY_AFTER.
664 uint32_t andedNorm16
= 0;
666 for (UChar32 start
= 0x10000;;) {
668 end
= umutablecptrie_getRange(norm16Trie
, start
, UCPMAP_RANGE_NORMAL
, 0,
669 nullptr, nullptr, &value
);
670 if (end
< 0) { break; }
672 if ((start
& 0x3ff) == 0) {
673 // Data for a new lead surrogate.
674 maxNorm16
= andedNorm16
= value
;
676 if (value
> maxNorm16
) {
679 andedNorm16
&= value
;
681 // Intersect each range with the code points for one lead surrogate.
682 UChar32 leadEnd
= start
| 0x3ff;
683 if (leadEnd
<= end
) {
684 // End of the supplementary block for a lead surrogate.
685 if (maxNorm16
>= (uint32_t)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]) {
686 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
687 // Otherwise it might end up at something like JAMO_VT which stays in
688 // the inner decomposition quick check loop.
689 maxNorm16
= (uint32_t)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
];
692 (maxNorm16
& ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER
)|
693 (andedNorm16
& Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER
);
694 if (maxNorm16
!= Normalizer2Impl::INERT
) {
695 umutablecptrie_set(norm16Trie
, U16_LEAD(start
), maxNorm16
, errorCode
);
697 if (value
== Normalizer2Impl::INERT
) {
698 // Potentially skip inert supplementary blocks for several lead surrogates.
699 start
= (end
+ 1) & ~0x3ff;
708 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
709 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
710 // which is harmless.
711 // As a result, the minimum code points are always BMP code points.
712 int32_t minCP
=indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
];
714 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=U16_LEAD(minCP
);
716 minCP
=indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
];
718 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=U16_LEAD(minCP
);
720 minCP
=indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
];
722 indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]=U16_LEAD(minCP
);
725 LocalUCPTriePointer
builtTrie(
726 umutablecptrie_buildImmutable(norm16Trie
, UCPTRIE_TYPE_FAST
, UCPTRIE_VALUE_BITS_16
, errorCode
));
727 norm16TrieLength
=ucptrie_toBinary(builtTrie
.getAlias(), nullptr, 0, errorCode
);
728 if(errorCode
.get()!=U_BUFFER_OVERFLOW_ERROR
) {
729 fprintf(stderr
, "gennorm2 error: unable to build/serialize the normalization trie - %s\n",
730 errorCode
.errorName());
731 exit(errorCode
.reset());
733 umutablecptrie_close(norm16Trie
);
735 norm16TrieBytes
=new uint8_t[norm16TrieLength
];
736 ucptrie_toBinary(builtTrie
.getAlias(), norm16TrieBytes
, norm16TrieLength
, errorCode
);
737 errorCode
.assertSuccess();
739 int32_t offset
=(int32_t)sizeof(indexes
);
740 indexes
[Normalizer2Impl::IX_NORM_TRIE_OFFSET
]=offset
;
741 offset
+=norm16TrieLength
;
742 indexes
[Normalizer2Impl::IX_EXTRA_DATA_OFFSET
]=offset
;
743 offset
+=extraData
.length()*2;
744 indexes
[Normalizer2Impl::IX_SMALL_FCD_OFFSET
]=offset
;
745 offset
+=sizeof(smallFCD
);
746 int32_t totalSize
=offset
;
747 for(int32_t i
=Normalizer2Impl::IX_RESERVED3_OFFSET
; i
<=Normalizer2Impl::IX_TOTAL_SIZE
; ++i
) {
748 indexes
[i
]=totalSize
;
752 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength
);
753 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData
.length());
754 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD
));
755 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize
);
756 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]);
757 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]);
758 printf("minLcccCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_LCCC_CP
]);
759 printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_YES_NO
]);
760 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]);
761 printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO
]);
762 printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
]);
763 printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC
]);
764 printf("minNoNoEmpty: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO_EMPTY
]);
765 printf("limitNoNo: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]);
766 printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta
);
767 printf("minMaybeYes: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]);
770 UVersionInfo nullVersion
={ 0, 0, 0, 0 };
771 if(0==memcmp(nullVersion
, unicodeVersion
, 4)) {
772 u_versionFromString(unicodeVersion
, U_UNICODE_VERSION
);
774 memcpy(dataInfo
.dataVersion
, unicodeVersion
, 4);
778 void Normalizer2DataBuilder::writeBinaryFile(const char *filename
) {
781 IcuToolErrorCode
errorCode("gennorm2/writeBinaryFile()");
782 UNewDataMemory
*pData
=
783 udata_create(NULL
, NULL
, filename
, &dataInfo
,
784 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, errorCode
);
785 if(errorCode
.isFailure()) {
786 fprintf(stderr
, "gennorm2 error: unable to create the output file %s - %s\n",
787 filename
, errorCode
.errorName());
788 exit(errorCode
.reset());
790 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
791 udata_writeBlock(pData
, norm16TrieBytes
, norm16TrieLength
);
792 udata_writeUString(pData
, toUCharPtr(extraData
.getBuffer()), extraData
.length());
793 udata_writeBlock(pData
, smallFCD
, sizeof(smallFCD
));
794 int32_t writtenSize
=udata_finish(pData
, errorCode
);
795 if(errorCode
.isFailure()) {
796 fprintf(stderr
, "gennorm2: error %s writing the output file\n", errorCode
.errorName());
797 exit(errorCode
.reset());
799 int32_t totalSize
=indexes
[Normalizer2Impl::IX_TOTAL_SIZE
];
800 if(writtenSize
!=totalSize
) {
801 fprintf(stderr
, "gennorm2 error: written size %ld != calculated size %ld\n",
802 (long)writtenSize
, (long)totalSize
);
803 exit(U_INTERNAL_PROGRAM_ERROR
);
808 Normalizer2DataBuilder::writeCSourceFile(const char *filename
) {
809 LocalUCPTriePointer norm16Trie
= processData();
811 IcuToolErrorCode
errorCode("gennorm2/writeCSourceFile()");
812 const char *basename
=findBasename(filename
);
813 CharString
path(filename
, (int32_t)(basename
-filename
), errorCode
);
814 CharString
dataName(basename
, errorCode
);
815 const char *extension
=strrchr(basename
, '.');
816 if(extension
!=NULL
) {
817 dataName
.truncate((int32_t)(extension
-basename
));
819 const char *name
=dataName
.data();
820 errorCode
.assertSuccess();
822 FILE *f
=usrc_create(path
.data(), basename
, 2016, "icu/source/tools/gennorm2/n2builder.cpp");
824 fprintf(stderr
, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
826 exit(U_FILE_ACCESS_ERROR
);
828 fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f
);
831 sprintf(line
, "static const UVersionInfo %s_formatVersion={", name
);
832 usrc_writeArray(f
, line
, dataInfo
.formatVersion
, 8, 4, "};\n");
833 sprintf(line
, "static const UVersionInfo %s_dataVersion={", name
);
834 usrc_writeArray(f
, line
, dataInfo
.dataVersion
, 8, 4, "};\n\n");
835 sprintf(line
, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name
);
836 usrc_writeArray(f
, line
, indexes
, 32, Normalizer2Impl::IX_COUNT
, "\n};\n\n");
838 usrc_writeUCPTrie(f
, name
, norm16Trie
.getAlias());
840 sprintf(line
, "static const uint16_t %s_extraData[%%ld]={\n", name
);
841 usrc_writeArray(f
, line
, extraData
.getBuffer(), 16, extraData
.length(), "\n};\n\n");
842 sprintf(line
, "static const uint8_t %s_smallFCD[%%ld]={\n", name
);
843 usrc_writeArray(f
, line
, smallFCD
, 8, sizeof(smallFCD
), "\n};\n\n");
845 fputs("#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f
);
851 bool equalStrings(const UnicodeString
*s1
, const UnicodeString
*s2
) {
853 return s2
== nullptr;
854 } else if(s2
== nullptr) {
// Marker characters printed by writeDataFile() between a code point (range)
// and its mapping; indexed with a Norm::MappingType value.
861 const char *typeChars
= "?-=>";
863 void writeMapping(FILE *f
, const UnicodeString
*m
) {
864 if(m
!= nullptr && !m
->isEmpty()) {
866 UChar32 c
= m
->char32At(i
);
867 fprintf(f
, "%04lX", (long)c
);
868 while((i
+= U16_LENGTH(c
)) < m
->length()) {
870 fprintf(f
, " %04lX", (long)c
);
879 Normalizer2DataBuilder::writeDataFile(const char *filename
, bool writeRemoved
) const {
880 // Do not processData() before writing the input-syntax data file.
881 FILE *f
= fopen(filename
, "w");
883 fprintf(stderr
, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
885 exit(U_FILE_ACCESS_ERROR
);
889 if(unicodeVersion
[0] != 0 || unicodeVersion
[1] != 0 ||
890 unicodeVersion
[2] != 0 || unicodeVersion
[3] != 0) {
891 char uv
[U_MAX_VERSION_STRING_LENGTH
];
892 u_versionToString(unicodeVersion
, uv
);
893 fprintf(f
, "* Unicode %s\n\n", uv
);
896 UnicodeSetIterator
ccIter(norms
.ccSet
);
897 UChar32 start
= U_SENTINEL
;
898 UChar32 end
= U_SENTINEL
;
901 bool didWrite
= false;
905 if(ccIter
.next() && !ccIter
.isString()) {
906 c
= ccIter
.getCodepoint();
913 if(cc
== prevCC
&& c
== (end
+ 1)) {
918 fprintf(f
, "%04lX:%d\n", (long)start
, (int)prevCC
);
920 fprintf(f
, "%04lX..%04lX:%d\n", (long)start
, (long)end
, (int)prevCC
);
932 UnicodeSetIterator
mIter(norms
.mappingSet
);
935 const UnicodeString
*prevMapping
= nullptr;
936 Norm::MappingType prevType
= Norm::NONE
;
941 if(mIter
.next() && !mIter
.isString()) {
942 c
= mIter
.getCodepoint();
943 norm
= norms
.getNorm(c
);
949 const UnicodeString
*mapping
;
950 Norm::MappingType type
;
951 if(norm
== nullptr) {
955 type
= norm
->mappingType
;
956 if(type
== Norm::NONE
) {
959 mapping
= norm
->mapping
;
962 if(type
== prevType
&& equalStrings(mapping
, prevMapping
) && c
== (end
+ 1)) {
965 if(writeRemoved
? prevType
!= Norm::NONE
: prevType
> Norm::REMOVED
) {
967 fprintf(f
, "%04lX%c", (long)start
, typeChars
[prevType
]);
969 fprintf(f
, "%04lX..%04lX%c", (long)start
, (long)end
, typeChars
[prevType
]);
971 writeMapping(f
, prevMapping
);
974 prevMapping
= mapping
;
983 Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder
&b1
,
984 const Normalizer2DataBuilder
&b2
,
985 Normalizer2DataBuilder
&diff
) {
986 // Compute diff = b1 - b2
987 // so that we should be able to get b1 = b2 + diff.
988 if(0 != memcmp(b1
.unicodeVersion
, b2
.unicodeVersion
, U_MAX_VERSION_LENGTH
)) {
989 memcpy(diff
.unicodeVersion
, b1
.unicodeVersion
, U_MAX_VERSION_LENGTH
);
992 UnicodeSet
ccSet(b1
.norms
.ccSet
);
993 ccSet
.addAll(b2
.norms
.ccSet
);
994 UnicodeSetIterator
ccIter(ccSet
);
995 while(ccIter
.next() && !ccIter
.isString()) {
996 UChar32 c
= ccIter
.getCodepoint();
997 uint8_t cc1
= b1
.norms
.getCC(c
);
998 uint8_t cc2
= b2
.norms
.getCC(c
);
1004 UnicodeSet
mSet(b1
.norms
.mappingSet
);
1005 mSet
.addAll(b2
.norms
.mappingSet
);
1006 UnicodeSetIterator
mIter(mSet
);
1007 while(mIter
.next() && !mIter
.isString()) {
1008 UChar32 c
= mIter
.getCodepoint();
1009 const Norm
*norm1
= b1
.norms
.getNorm(c
);
1010 const Norm
*norm2
= b2
.norms
.getNorm(c
);
1011 const UnicodeString
*mapping1
;
1012 Norm::MappingType type1
;
1013 if(norm1
== nullptr || !norm1
->hasMapping()) {
1017 mapping1
= norm1
->mapping
;
1018 type1
= norm1
->mappingType
;
1020 const UnicodeString
*mapping2
;
1021 Norm::MappingType type2
;
1022 if(norm2
== nullptr || !norm2
->hasMapping()) {
1026 mapping2
= norm2
->mapping
;
1027 type2
= norm2
->mappingType
;
1029 if(type1
== type2
&& equalStrings(mapping1
, mapping2
)) {
1031 } else if(type1
== Norm::NONE
) {
1032 diff
.removeMapping(c
);
1033 } else if(type1
== Norm::ROUND_TRIP
) {
1034 diff
.setRoundTripMapping(c
, *mapping1
);
1035 } else if(type1
== Norm::ONE_WAY
) {
1036 diff
.setOneWayMapping(c
, *mapping1
);
1043 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1046 * Hey, Emacs, please set the following:
1049 * indent-tabs-mode: nil