2 *******************************************************************************
4 * Copyright (C) 2009-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: n2builder.cpp
10 * tab size: 8 (not used)
13 * created on: 2009nov25
14 * created by: Markus W. Scherer
16 * Builds Normalizer2 data and writes a binary .nrm file.
17 * For the file format see source/common/normalizer2impl.h.
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
29 #include "unicode/errorcode.h"
30 #include "unicode/localpointer.h"
31 #include "unicode/putil.h"
32 #include "unicode/udata.h"
33 #include "unicode/uniset.h"
34 #include "unicode/unistr.h"
35 #include "unicode/ustring.h"
37 #include "normalizer2impl.h"
43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
45 #if !UCONFIG_NO_NORMALIZATION
47 /* UDataInfo cf. udata.h */
48 static UDataInfo dataInfo
={
57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
58 { 1, 0, 0, 0 }, /* formatVersion */
59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */
64 class HangulIterator
{
71 HangulIterator() : rangeIndex(0) {}
72 const Range
*nextRange() {
73 if(rangeIndex
<LENGTHOF(ranges
)) {
74 return ranges
+rangeIndex
++;
79 void reset() { rangeIndex
=0; }
81 static const Range ranges
[4];
85 const HangulIterator::Range
HangulIterator::ranges
[4]={
86 { Hangul::JAMO_L_BASE
, Hangul::JAMO_L_BASE
+Hangul::JAMO_L_COUNT
, 1 },
87 { Hangul::JAMO_V_BASE
, Hangul::JAMO_V_BASE
+Hangul::JAMO_V_COUNT
, Normalizer2Impl::JAMO_VT
},
88 // JAMO_T_BASE+1: not U+11A7
89 { Hangul::JAMO_T_BASE
+1, Hangul::JAMO_T_BASE
+Hangul::JAMO_T_COUNT
, Normalizer2Impl::JAMO_VT
},
90 { Hangul::HANGUL_BASE
, Hangul::HANGUL_BASE
+Hangul::HANGUL_COUNT
, 0 }, // will become minYesNo
93 struct CompositionPair
{
94 CompositionPair(UChar32 t
, UChar32 c
) : trail(t
), composite(c
) {}
95 UChar32 trail
, composite
;
99 enum MappingType
{ NONE
, REMOVED
, ROUND_TRIP
, ONE_WAY
};
101 UBool
hasMapping() const { return mappingType
>REMOVED
; }
103 // Requires hasMapping() and well-formed mapping.
104 void setMappingCP() {
106 if(!mapping
->isEmpty() && mapping
->length()==U16_LENGTH(c
=mapping
->char32At(0))) {
109 mappingCP
=U_SENTINEL
;
113 const CompositionPair
*getCompositionPairs(int32_t &length
) const {
114 if(compositions
==NULL
) {
118 length
=compositions
->size()/2;
119 return reinterpret_cast<const CompositionPair
*>(compositions
->getBuffer());
123 UnicodeString
*mapping
;
124 UChar32 mappingCP
; // >=0 if mapping to 1 code point
125 int32_t mappingPhase
;
126 MappingType mappingType
;
128 UVector32
*compositions
; // (trail, composite) pairs
131 UBool hasNoCompBoundaryAfter
;
134 OFFSET_NONE
, OFFSET_MAYBE_YES
,
135 OFFSET_YES_YES
, OFFSET_YES_NO
, OFFSET_NO_NO
,
138 enum { OFFSET_SHIFT
=4, OFFSET_MASK
=(1<<OFFSET_SHIFT
)-1 };
142 class Normalizer2DBEnumerator
{
144 Normalizer2DBEnumerator(Normalizer2DataBuilder
&b
) : builder(b
) {}
145 virtual ~Normalizer2DBEnumerator() {}
146 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) = 0;
147 Normalizer2DBEnumerator
*ptr() { return this; }
149 Normalizer2DataBuilder
&builder
;
154 static UBool U_CALLCONV
155 enumRangeHandler(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
156 return ((Normalizer2DBEnumerator
*)context
)->rangeHandler(start
, end
, value
);
161 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode
&errorCode
) :
162 phase(0), overrideHandling(OVERRIDE_PREVIOUS
), optimization(OPTIMIZE_NORMAL
) {
163 memset(unicodeVersion
, 0, sizeof(unicodeVersion
));
164 normTrie
=utrie2_open(0, 0, &errorCode
);
165 normMem
=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm
));
166 norms
=allocNorm(); // unused Norm struct at index 0
167 memset(indexes
, 0, sizeof(indexes
));
170 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
171 utrie2_close(normTrie
);
172 int32_t normsLength
=utm_countItems(normMem
);
173 for(int32_t i
=1; i
<normsLength
; ++i
) {
174 delete norms
[i
].mapping
;
175 delete norms
[i
].compositions
;
178 utrie2_close(norm16Trie
);
182 Normalizer2DataBuilder::setUnicodeVersion(const char *v
) {
183 u_versionFromString(unicodeVersion
, v
);
186 Norm
*Normalizer2DataBuilder::allocNorm() {
187 Norm
*p
=(Norm
*)utm_alloc(normMem
);
188 norms
=(Norm
*)utm_getStart(normMem
); // in case it got reallocated
192 /* get an existing Norm unit */
193 Norm
*Normalizer2DataBuilder::getNorm(UChar32 c
) {
194 uint32_t i
=utrie2_get32(normTrie
, c
);
201 const Norm
&Normalizer2DataBuilder::getNormRef(UChar32 c
) const {
202 return norms
[utrie2_get32(normTrie
, c
)];
206 * get or create a Norm unit;
207 * get or create the intermediate trie entries for it as well
209 Norm
*Normalizer2DataBuilder::createNorm(UChar32 c
) {
210 uint32_t i
=utrie2_get32(normTrie
, c
);
216 IcuToolErrorCode
errorCode("gennorm2/createNorm()");
217 utrie2_set32(normTrie
, c
, (uint32_t)(p
-norms
), errorCode
);
222 Norm
*Normalizer2DataBuilder::checkNormForMapping(Norm
*p
, UChar32 c
) {
224 if(p
->mappingType
!=Norm::NONE
) {
225 if( overrideHandling
==OVERRIDE_NONE
||
226 (overrideHandling
==OVERRIDE_PREVIOUS
&& p
->mappingPhase
==phase
)
229 "error in gennorm2 phase %d: "
230 "not permitted to override mapping for U+%04lX from phase %d\n",
231 (int)phase
, (long)c
, (int)p
->mappingPhase
);
232 exit(U_INVALID_FORMAT_ERROR
);
237 p
->mappingPhase
=phase
;
242 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh
) {
247 void Normalizer2DataBuilder::setCC(UChar32 c
, uint8_t cc
) {
248 createNorm(c
)->cc
=cc
;
251 uint8_t Normalizer2DataBuilder::getCC(UChar32 c
) const {
252 return getNormRef(c
).cc
;
255 static UBool
isWellFormed(const UnicodeString
&s
) {
256 UErrorCode errorCode
=U_ZERO_ERROR
;
257 u_strToUTF8(NULL
, 0, NULL
, s
.getBuffer(), s
.length(), &errorCode
);
258 return U_SUCCESS(errorCode
) || errorCode
==U_BUFFER_OVERFLOW_ERROR
;
261 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c
, const UnicodeString
&m
) {
262 if(!isWellFormed(m
)) {
264 "error in gennorm2 phase %d: "
265 "illegal one-way mapping from U+%04lX to malformed string\n",
266 (int)phase
, (long)c
);
267 exit(U_INVALID_FORMAT_ERROR
);
269 Norm
*p
=checkNormForMapping(createNorm(c
), c
);
270 p
->mapping
=new UnicodeString(m
);
271 p
->mappingType
=Norm::ONE_WAY
;
275 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c
, const UnicodeString
&m
) {
276 if(U_IS_SURROGATE(c
)) {
278 "error in gennorm2 phase %d: "
279 "illegal round-trip mapping from surrogate code point U+%04lX\n",
280 (int)phase
, (long)c
);
281 exit(U_INVALID_FORMAT_ERROR
);
283 if(!isWellFormed(m
)) {
285 "error in gennorm2 phase %d: "
286 "illegal round-trip mapping from U+%04lX to malformed string\n",
287 (int)phase
, (long)c
);
288 exit(U_INVALID_FORMAT_ERROR
);
290 int32_t numCP
=u_countChar32(m
.getBuffer(), m
.length());
293 "error in gennorm2 phase %d: "
294 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
295 (int)phase
, (long)c
, (int)numCP
);
296 exit(U_INVALID_FORMAT_ERROR
);
298 Norm
*p
=checkNormForMapping(createNorm(c
), c
);
299 p
->mapping
=new UnicodeString(m
);
300 p
->mappingType
=Norm::ROUND_TRIP
;
301 p
->mappingCP
=U_SENTINEL
;
304 void Normalizer2DataBuilder::removeMapping(UChar32 c
) {
305 Norm
*p
=checkNormForMapping(getNorm(c
), c
);
307 p
->mappingType
=Norm::REMOVED
;
311 class CompositionBuilder
: public Normalizer2DBEnumerator
{
313 CompositionBuilder(Normalizer2DataBuilder
&b
) : Normalizer2DBEnumerator(b
) {}
314 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
315 builder
.addComposition(start
, end
, value
);
321 Normalizer2DataBuilder::addComposition(UChar32 start
, UChar32 end
, uint32_t value
) {
322 if(norms
[value
].mappingType
==Norm::ROUND_TRIP
) {
325 "gennorm2 error: same round-trip mapping for "
326 "more than 1 code point U+%04lX..U+%04lX\n",
327 (long)start
, (long)end
);
328 exit(U_INVALID_FORMAT_ERROR
);
330 if(norms
[value
].cc
!=0) {
333 "U+%04lX has a round-trip mapping and ccc!=0, "
334 "not possible in Unicode normalization\n",
336 exit(U_INVALID_FORMAT_ERROR
);
338 // setRoundTripMapping() ensured that there are exactly two code points.
339 const UnicodeString
&m
=*norms
[value
].mapping
;
340 UChar32 lead
=m
.char32At(0);
341 UChar32 trail
=m
.char32At(m
.length()-1);
345 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
346 "not possible in Unicode normalization\n",
347 (long)start
, (long)lead
);
348 exit(U_INVALID_FORMAT_ERROR
);
350 // Flag for trailing character.
351 createNorm(trail
)->combinesBack
=TRUE
;
352 // Insert (trail, composite) pair into compositions list for the lead character.
353 IcuToolErrorCode
errorCode("gennorm2/addComposition()");
354 Norm
*leadNorm
=createNorm(lead
);
355 UVector32
*compositions
=leadNorm
->compositions
;
357 if(compositions
==NULL
) {
358 compositions
=leadNorm
->compositions
=new UVector32(errorCode
);
359 i
=0; // "insert" the first pair at index 0
361 // Insertion sort, and check for duplicate trail characters.
363 const CompositionPair
*pairs
=leadNorm
->getCompositionPairs(length
);
364 for(i
=0; i
<length
; ++i
) {
365 if(trail
==pairs
[i
].trail
) {
367 "gennorm2 error: same round-trip mapping for "
368 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
369 (long)start
, (long)lead
, (long)trail
);
370 exit(U_INVALID_FORMAT_ERROR
);
372 if(trail
<pairs
[i
].trail
) {
377 compositions
->insertElementAt(trail
, 2*i
, errorCode
);
378 compositions
->insertElementAt(start
, 2*i
+1, errorCode
);
382 UBool
Normalizer2DataBuilder::combinesWithCCBetween(const Norm
&norm
,
383 uint8_t lowCC
, uint8_t highCC
) const {
384 if((highCC
-lowCC
)>=2) {
386 const CompositionPair
*pairs
=norm
.getCompositionPairs(length
);
387 for(int32_t i
=0; i
<length
; ++i
) {
388 uint8_t trailCC
=getCC(pairs
[i
].trail
);
389 if(lowCC
<trailCC
&& trailCC
<highCC
) {
397 UChar32
Normalizer2DataBuilder::combine(const Norm
&norm
, UChar32 trail
) const {
399 const CompositionPair
*pairs
=norm
.getCompositionPairs(length
);
400 for(int32_t i
=0; i
<length
; ++i
) {
401 if(trail
==pairs
[i
].trail
) {
402 return pairs
[i
].composite
;
404 if(trail
<pairs
[i
].trail
) {
411 class Decomposer
: public Normalizer2DBEnumerator
{
413 Decomposer(Normalizer2DataBuilder
&b
) : Normalizer2DBEnumerator(b
), didDecompose(FALSE
) {}
414 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
415 didDecompose
|=builder
.decompose(start
, end
, value
);
422 Normalizer2DataBuilder::decompose(UChar32 start
, UChar32 end
, uint32_t value
) {
423 if(norms
[value
].hasMapping()) {
424 const UnicodeString
&m
=*norms
[value
].mapping
;
425 UnicodeString
*decomposed
=NULL
;
426 const UChar
*s
=m
.getBuffer();
427 int32_t length
=m
.length();
432 U16_NEXT(s
, i
, length
, c
);
433 if(start
<=c
&& c
<=end
) {
435 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
437 exit(U_INVALID_FORMAT_ERROR
);
439 const Norm
&cNorm
=getNormRef(c
);
440 if(cNorm
.hasMapping()) {
441 if(norms
[value
].mappingType
==Norm::ROUND_TRIP
) {
443 if(cNorm
.mappingType
!=Norm::ROUND_TRIP
) {
446 "U+%04lX's round-trip mapping's starter "
447 "U+%04lX one-way-decomposes, "
448 "not possible in Unicode normalization\n",
449 (long)start
, (long)c
);
450 exit(U_INVALID_FORMAT_ERROR
);
452 uint8_t myTrailCC
=getCC(m
.char32At(i
));
453 UChar32 cTrailChar
=cNorm
.mapping
->char32At(cNorm
.mapping
->length()-1);
454 uint8_t cTrailCC
=getCC(cTrailChar
);
455 if(cTrailCC
>myTrailCC
) {
458 "U+%04lX's round-trip mapping's starter "
459 "U+%04lX decomposes and the "
460 "inner/earlier tccc=%hu > outer/following tccc=%hu, "
461 "not possible in Unicode normalization\n",
462 (long)start
, (long)c
,
463 (short)cTrailCC
, (short)myTrailCC
);
464 exit(U_INVALID_FORMAT_ERROR
);
469 "U+%04lX's round-trip mapping's non-starter "
470 "U+%04lX decomposes, "
471 "not possible in Unicode normalization\n",
472 (long)start
, (long)c
);
473 exit(U_INVALID_FORMAT_ERROR
);
476 if(decomposed
==NULL
) {
477 decomposed
=new UnicodeString(m
, 0, prev
);
479 decomposed
->append(*cNorm
.mapping
);
480 } else if(Hangul::isHangul(c
)) {
482 int32_t hangulLength
=Hangul::decompose(c
, buffer
);
483 if(norms
[value
].mappingType
==Norm::ROUND_TRIP
&& prev
!=0) {
486 "U+%04lX's round-trip mapping's non-starter "
487 "U+%04lX decomposes, "
488 "not possible in Unicode normalization\n",
489 (long)start
, (long)c
);
490 exit(U_INVALID_FORMAT_ERROR
);
492 if(decomposed
==NULL
) {
493 decomposed
=new UnicodeString(m
, 0, prev
);
495 decomposed
->append(buffer
, hangulLength
);
496 } else if(decomposed
!=NULL
) {
497 decomposed
->append(m
, prev
, i
-prev
);
500 if(decomposed
!=NULL
) {
501 delete norms
[value
].mapping
;
502 norms
[value
].mapping
=decomposed
;
503 // Not norms[value].setMappingCP(); because the original mapping
504 // is most likely to be encodable as a delta.
511 class BuilderReorderingBuffer
{
513 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE
) {}
516 fLastStarterIndex
=-1;
519 int32_t length() const { return fLength
; }
520 UBool
isEmpty() const { return fLength
==0; }
521 int32_t lastStarterIndex() const { return fLastStarterIndex
; }
522 UChar32
charAt(int32_t i
) const { return fArray
[i
]>>8; }
523 uint8_t ccAt(int32_t i
) const { return (uint8_t)fArray
[i
]; }
524 UBool
didReorder() const { return fDidReorder
; }
525 void append(UChar32 c
, uint8_t cc
) {
526 if(cc
==0 || fLength
==0 || ccAt(fLength
-1)<=cc
) {
528 fLastStarterIndex
=fLength
;
530 fArray
[fLength
++]=(c
<<8)|cc
;
533 // Let this character bubble back to its canonical order.
535 while(i
>fLastStarterIndex
&& ccAt(i
)>cc
) {
538 ++i
; // after the last starter or prevCC<=cc
539 // Move this and the following characters forward one to make space.
540 for(int32_t j
=fLength
; i
<j
; --j
) {
541 fArray
[j
]=fArray
[j
-1];
547 void toString(UnicodeString
&dest
) {
549 for(int32_t i
=0; i
<fLength
; ++i
) {
550 dest
.append(charAt(i
));
553 void setComposite(UChar32 composite
, int32_t combMarkIndex
) {
554 fArray
[fLastStarterIndex
]=composite
<<8;
555 // Remove the combining mark that contributed to the composite.
557 while(combMarkIndex
<fLength
) {
558 fArray
[combMarkIndex
]=fArray
[combMarkIndex
+1];
563 int32_t fArray
[Normalizer2Impl::MAPPING_LENGTH_MASK
];
565 int32_t fLastStarterIndex
;
570 Normalizer2DataBuilder::reorder(Norm
*p
, BuilderReorderingBuffer
&buffer
) {
571 UnicodeString
&m
=*p
->mapping
;
572 int32_t length
=m
.length();
573 if(length
>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
574 return; // writeMapping() will complain about it and print the code point.
576 const UChar
*s
=m
.getBuffer();
580 U16_NEXT(s
, i
, length
, c
);
581 buffer
.append(c
, getCC(c
));
583 if(buffer
.didReorder()) {
588 UBool
Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer
&buffer
) {
589 if(buffer
.isEmpty()) {
590 return TRUE
; // maps-to-empty string is no boundary of any kind
592 int32_t lastStarterIndex
=buffer
.lastStarterIndex();
593 if(lastStarterIndex
<0) {
594 return TRUE
; // no starter
596 UChar32 starter
=buffer
.charAt(lastStarterIndex
);
597 if( Hangul::isJamoL(starter
) ||
598 (Hangul::isJamoV(starter
) &&
599 0<lastStarterIndex
&& Hangul::isJamoL(buffer
.charAt(lastStarterIndex
-1)))
601 // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
602 // otherwise it is blocked.
603 return lastStarterIndex
==buffer
.length()-1;
605 // no Hangul in fully decomposed mapping
606 const Norm
*starterNorm
=&getNormRef(starter
);
607 if(starterNorm
->compositions
==NULL
) {
608 return FALSE
; // the last starter does not combine forward
610 // Compose as far as possible, and see if further compositions are possible.
612 for(int32_t combMarkIndex
=lastStarterIndex
+1; combMarkIndex
<buffer
.length();) {
613 uint8_t cc
=buffer
.ccAt(combMarkIndex
); // !=0 because after last starter
614 if(combinesWithCCBetween(*starterNorm
, prevCC
, cc
)) {
618 (starter
=combine(*starterNorm
, buffer
.charAt(combMarkIndex
)))>=0
620 buffer
.setComposite(starter
, combMarkIndex
);
621 starterNorm
=&getNormRef(starter
);
622 if(starterNorm
->compositions
==NULL
) {
623 return FALSE
; // the composite does not combine further
630 // TRUE if the final, forward-combining starter is at the end.
634 // Requires p->hasMapping().
635 void Normalizer2DataBuilder::writeMapping(UChar32 c
, const Norm
*p
, UnicodeString
&dataString
) {
636 UnicodeString
&m
=*p
->mapping
;
637 int32_t length
=m
.length();
638 if(length
>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
641 "mapping for U+%04lX longer than maximum of %d\n",
642 (long)c
, Normalizer2Impl::MAPPING_LENGTH_MASK
);
643 exit(U_INVALID_FORMAT_ERROR
);
645 int32_t leadCC
, trailCC
;
649 leadCC
=getCC(m
.char32At(0));
650 trailCC
=getCC(m
.char32At(length
-1));
652 if(c
<Normalizer2Impl::MIN_CCC_LCCC_CP
&& (p
->cc
!=0 || leadCC
!=0)) {
655 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
657 exit(U_INVALID_FORMAT_ERROR
);
659 int32_t firstUnit
=length
|(trailCC
<<8);
660 int32_t secondUnit
=p
->cc
|(leadCC
<<8);
662 firstUnit
|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD
;
664 if(p
->compositions
!=NULL
) {
665 firstUnit
|=Normalizer2Impl::MAPPING_PLUS_COMPOSITION_LIST
;
667 if(p
->hasNoCompBoundaryAfter
) {
668 firstUnit
|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER
;
670 dataString
.append((UChar
)firstUnit
);
672 dataString
.append((UChar
)secondUnit
);
674 dataString
.append(m
);
677 // Requires p->compositions!=NULL.
678 void Normalizer2DataBuilder::writeCompositions(UChar32 c
, const Norm
*p
, UnicodeString
&dataString
) {
682 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
684 exit(U_INVALID_FORMAT_ERROR
);
687 const CompositionPair
*pairs
=p
->getCompositionPairs(length
);
688 for(int32_t i
=0; i
<length
; ++i
) {
689 const CompositionPair
&pair
=pairs
[i
];
690 // 22 bits for the composite character and whether it combines forward.
691 UChar32 compositeAndFwd
=pair
.composite
<<1;
692 if(getNormRef(pair
.composite
).compositions
!=NULL
) {
693 compositeAndFwd
|=1; // The composite character also combines-forward.
695 // Encode most pairs in two units and some in three.
696 int32_t firstUnit
, secondUnit
, thirdUnit
;
697 if(pair
.trail
<Normalizer2Impl::COMP_1_TRAIL_LIMIT
) {
698 if(compositeAndFwd
<=0xffff) {
699 firstUnit
=pair
.trail
<<1;
700 secondUnit
=compositeAndFwd
;
703 firstUnit
=(pair
.trail
<<1)|Normalizer2Impl::COMP_1_TRIPLE
;
704 secondUnit
=compositeAndFwd
>>16;
705 thirdUnit
=compositeAndFwd
;
708 firstUnit
=(Normalizer2Impl::COMP_1_TRAIL_LIMIT
+
709 (pair
.trail
>>Normalizer2Impl::COMP_1_TRAIL_SHIFT
))|
710 Normalizer2Impl::COMP_1_TRIPLE
;
711 secondUnit
=(pair
.trail
<<Normalizer2Impl::COMP_2_TRAIL_SHIFT
)|
712 (compositeAndFwd
>>16);
713 thirdUnit
=compositeAndFwd
;
715 // Set the high bit of the first unit if this is the last composition pair.
717 firstUnit
|=Normalizer2Impl::COMP_1_LAST_TUPLE
;
719 dataString
.append((UChar
)firstUnit
).append((UChar
)secondUnit
);
721 dataString
.append((UChar
)thirdUnit
);
726 class ExtraDataWriter
: public Normalizer2DBEnumerator
{
728 ExtraDataWriter(Normalizer2DataBuilder
&b
) :
729 Normalizer2DBEnumerator(b
),
730 yesYesCompositions(1000, (UChar32
)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
731 yesNoData(1000, (UChar32
)0, 1) {} // 0=Hangul, 1=start of normal data
732 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
736 "gennorm2 error: unexpected shared data for "
737 "multiple code points U+%04lX..U+%04lX\n",
738 (long)start
, (long)end
);
739 exit(U_INTERNAL_PROGRAM_ERROR
);
741 builder
.writeExtraData(start
, value
, *this);
745 UnicodeString maybeYesCompositions
;
746 UnicodeString yesYesCompositions
;
747 UnicodeString yesNoData
;
748 UnicodeString noNoMappings
;
749 Hashtable previousNoNoMappings
; // If constructed in runtime code, pass in UErrorCode.
752 void Normalizer2DataBuilder::writeExtraData(UChar32 c
, uint32_t value
, ExtraDataWriter
&writer
) {
754 if(p
->combinesBack
) {
755 if(p
->hasMapping()) {
758 "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
760 exit(U_INVALID_FORMAT_ERROR
);
762 if(p
->compositions
!=NULL
) {
764 (writer
.maybeYesCompositions
.length()<<Norm::OFFSET_SHIFT
)|
765 Norm::OFFSET_MAYBE_YES
;
766 writeCompositions(c
, p
, writer
.maybeYesCompositions
);
768 } else if(!p
->hasMapping()) {
769 if(p
->compositions
!=NULL
) {
771 (writer
.yesYesCompositions
.length()<<Norm::OFFSET_SHIFT
)|
772 Norm::OFFSET_YES_YES
;
773 writeCompositions(c
, p
, writer
.yesYesCompositions
);
775 } else if(p
->mappingType
==Norm::ROUND_TRIP
) {
777 (writer
.yesNoData
.length()<<Norm::OFFSET_SHIFT
)|
779 writeMapping(c
, p
, writer
.yesNoData
);
780 if(p
->compositions
!=NULL
) {
781 writeCompositions(c
, p
, writer
.yesNoData
);
783 } else /* one-way */ {
784 if(p
->compositions
!=NULL
) {
787 "U+%04lX combines-forward and has a one-way mapping, "
788 "not possible in Unicode normalization\n",
790 exit(U_INVALID_FORMAT_ERROR
);
792 if(p
->cc
==0 && optimization
!=OPTIMIZE_FAST
) {
793 // Try a compact, algorithmic encoding.
794 // Only for ccc=0, because we can't store additional information.
795 if(p
->mappingCP
>=0) {
796 int32_t delta
=p
->mappingCP
-c
;
797 if(-Normalizer2Impl::MAX_DELTA
<=delta
&& delta
<=Normalizer2Impl::MAX_DELTA
) {
798 p
->offset
=(delta
<<Norm::OFFSET_SHIFT
)|Norm::OFFSET_DELTA
;
803 int32_t oldNoNoLength
=writer
.noNoMappings
.length();
804 writeMapping(c
, p
, writer
.noNoMappings
);
805 UnicodeString newMapping
=writer
.noNoMappings
.tempSubString(oldNoNoLength
);
806 int32_t previousOffset
=writer
.previousNoNoMappings
.geti(newMapping
);
807 if(previousOffset
!=0) {
808 // Duplicate, remove the new units and point to the old ones.
809 writer
.noNoMappings
.truncate(oldNoNoLength
);
811 ((previousOffset
-1)<<Norm::OFFSET_SHIFT
)|
814 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
815 IcuToolErrorCode
errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
816 writer
.previousNoNoMappings
.puti(newMapping
, oldNoNoLength
+1, errorCode
);
818 (oldNoNoLength
<<Norm::OFFSET_SHIFT
)|
825 class Norm16Writer
: public Normalizer2DBEnumerator
{
827 Norm16Writer(Normalizer2DataBuilder
&b
) : Normalizer2DBEnumerator(b
) {}
828 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
829 builder
.writeNorm16(start
, end
, value
);
834 void Normalizer2DataBuilder::writeNorm16(UChar32 start
, UChar32 end
, uint32_t value
) {
836 const Norm
*p
=norms
+value
;
837 int32_t offset
=p
->offset
>>Norm::OFFSET_SHIFT
;
839 UBool isDecompNo
=FALSE
;
840 UBool isCompNoMaybe
=FALSE
;
841 switch(p
->offset
&Norm::OFFSET_MASK
) {
842 case Norm::OFFSET_NONE
:
843 // No mapping, no compositions list.
844 if(p
->combinesBack
) {
845 norm16
=Normalizer2Impl::MIN_NORMAL_MAYBE_YES
+p
->cc
;
846 isDecompNo
=(UBool
)(p
->cc
!=0);
848 } else if(p
->cc
!=0) {
849 norm16
=Normalizer2Impl::MIN_YES_YES_WITH_CC
-1+p
->cc
;
850 isDecompNo
=isCompNoMaybe
=TRUE
;
853 case Norm::OFFSET_MAYBE_YES
:
854 norm16
=indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]+offset
;
857 case Norm::OFFSET_YES_YES
:
860 case Norm::OFFSET_YES_NO
:
861 norm16
=indexes
[Normalizer2Impl::IX_MIN_YES_NO
]+offset
;
864 case Norm::OFFSET_NO_NO
:
865 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO
]+offset
;
866 isDecompNo
=isCompNoMaybe
=TRUE
;
868 case Norm::OFFSET_DELTA
:
869 norm16
=getCenterNoNoDelta()+offset
;
870 isDecompNo
=isCompNoMaybe
=TRUE
;
872 default: // Should not occur.
873 exit(U_INTERNAL_PROGRAM_ERROR
);
875 IcuToolErrorCode
errorCode("gennorm2/writeNorm16()");
876 utrie2_setRange32(norm16Trie
, start
, end
, (uint32_t)norm16
, TRUE
, errorCode
);
877 if(isDecompNo
&& start
<indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]) {
878 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=start
;
880 if(isCompNoMaybe
&& start
<indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]) {
881 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=start
;
886 void Normalizer2DataBuilder::setHangulData() {
888 const HangulIterator::Range
*range
;
889 // Check that none of the Hangul/Jamo code points have data.
890 while((range
=hi
.nextRange())!=NULL
) {
891 for(UChar32 c
=range
->start
; c
<range
->limit
; ++c
) {
892 if(utrie2_get32(norm16Trie
, c
)!=0) {
895 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
897 exit(U_INVALID_FORMAT_ERROR
);
901 // Set data for algorithmic runtime handling.
902 IcuToolErrorCode
errorCode("gennorm2/setHangulData()");
904 while((range
=hi
.nextRange())!=NULL
) {
905 uint16_t norm16
=range
->norm16
;
907 norm16
=(uint16_t)indexes
[Normalizer2Impl::IX_MIN_YES_NO
]; // Hangul LV/LVT encoded as minYesNo
908 if(range
->start
<indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]) {
909 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=range
->start
;
912 if(range
->start
<indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]) { // Jamo V/T are maybeYes
913 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=range
->start
;
916 utrie2_setRange32(norm16Trie
, range
->start
, range
->limit
-1, norm16
, TRUE
, errorCode
);
917 errorCode
.assertSuccess();
923 static UBool U_CALLCONV
924 enumRangeMaxValue(const void *context
, UChar32
/*start*/, UChar32
/*end*/, uint32_t value
) {
925 uint32_t *pMaxValue
=(uint32_t *)context
;
926 if(value
>*pMaxValue
) {
934 void Normalizer2DataBuilder::processData() {
935 IcuToolErrorCode
errorCode("gennorm2/processData()");
936 norm16Trie
=utrie2_open(0, 0, errorCode
);
937 errorCode
.assertSuccess();
939 utrie2_enum(normTrie
, NULL
, enumRangeHandler
, CompositionBuilder(*this).ptr());
941 Decomposer
decomposer(*this);
943 decomposer
.didDecompose
=FALSE
;
944 utrie2_enum(normTrie
, NULL
, enumRangeHandler
, &decomposer
);
945 } while(decomposer
.didDecompose
);
947 BuilderReorderingBuffer buffer
;
948 int32_t normsLength
=utm_countItems(normMem
);
949 for(int32_t i
=1; i
<normsLength
; ++i
) {
950 if(norms
[i
].hasMapping()) {
952 reorder(norms
+i
, buffer
);
953 norms
[i
].hasNoCompBoundaryAfter
=hasNoCompBoundaryAfter(buffer
);
957 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=0x110000;
958 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=0x110000;
960 ExtraDataWriter
extraDataWriter(*this);
961 utrie2_enum(normTrie
, NULL
, enumRangeHandler
, &extraDataWriter
);
963 extraData
=extraDataWriter
.maybeYesCompositions
;
964 extraData
.append(extraDataWriter
.yesYesCompositions
).
965 append(extraDataWriter
.yesNoData
).
966 append(extraDataWriter
.noNoMappings
);
967 // Pad to even length for 4-byte alignment of following data.
968 if(extraData
.length()&1) {
969 extraData
.append((UChar
)0);
972 indexes
[Normalizer2Impl::IX_MIN_YES_NO
]=
973 extraDataWriter
.yesYesCompositions
.length();
974 indexes
[Normalizer2Impl::IX_MIN_NO_NO
]=
975 indexes
[Normalizer2Impl::IX_MIN_YES_NO
]+
976 extraDataWriter
.yesNoData
.length();
977 indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]=
978 indexes
[Normalizer2Impl::IX_MIN_NO_NO
]+
979 extraDataWriter
.noNoMappings
.length();
980 indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]=
981 Normalizer2Impl::MIN_NORMAL_MAYBE_YES
-
982 extraDataWriter
.maybeYesCompositions
.length();
984 int32_t minNoNoDelta
=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA
;
985 if(indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]>minNoNoDelta
) {
988 "data structure overflow, too much mapping composition data\n");
989 exit(U_BUFFER_OVERFLOW_ERROR
);
992 utrie2_enum(normTrie
, NULL
, enumRangeHandler
, Norm16Writer(*this).ptr());
996 // Look for the "worst" norm16 value of any supplementary code point
997 // corresponding to a lead surrogate, and set it as that surrogate's value.
998 // Enables quick check inner loops to look at only code units.
1000 // We could be more sophisticated:
1001 // We could collect a bit set for whether there are values in the different
1002 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
1003 // and select the best value that only breaks the composition and/or decomposition
1004 // inner loops if necessary.
1005 // However, that seems like overkill for an optimization for supplementary characters.
1006 for(UChar lead
=0xd800; lead
<0xdc00; ++lead
) {
1007 uint32_t maxValue
=utrie2_get32(norm16Trie
, lead
);
1008 utrie2_enumForLeadSurrogate(norm16Trie
, lead
, NULL
, enumRangeMaxValue
, &maxValue
);
1009 if( maxValue
>=(uint32_t)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
] &&
1010 maxValue
>(uint32_t)indexes
[Normalizer2Impl::IX_MIN_NO_NO
]
1012 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
1013 // Otherwise it might end up at something like JAMO_VT which stays in
1014 // the inner decomposition quick check loop.
1015 maxValue
=(uint32_t)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]-1;
1017 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie
, lead
, maxValue
, errorCode
);
1020 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
1021 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
1022 // which is harmless.
1023 // As a result, the minimum code points are always BMP code points.
1024 int32_t minCP
=indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
];
1025 if(minCP
>=0x10000) {
1026 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=U16_LEAD(minCP
);
1028 minCP
=indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
];
1029 if(minCP
>=0x10000) {
1030 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=U16_LEAD(minCP
);
1034 void Normalizer2DataBuilder::writeBinaryFile(const char *filename
) {
1037 IcuToolErrorCode
errorCode("gennorm2/writeBinaryFile()");
1038 utrie2_freeze(norm16Trie
, UTRIE2_16_VALUE_BITS
, errorCode
);
1039 int32_t norm16TrieLength
=utrie2_serialize(norm16Trie
, NULL
, 0, errorCode
);
1040 if(errorCode
.get()!=U_BUFFER_OVERFLOW_ERROR
) {
1041 fprintf(stderr
, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
1042 errorCode
.errorName());
1043 exit(errorCode
.reset());
1046 LocalArray
<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength
]);
1047 utrie2_serialize(norm16Trie
, norm16TrieBytes
.getAlias(), norm16TrieLength
, errorCode
);
1048 errorCode
.assertSuccess();
1050 int32_t offset
=(int32_t)sizeof(indexes
);
1051 indexes
[Normalizer2Impl::IX_NORM_TRIE_OFFSET
]=offset
;
1052 offset
+=norm16TrieLength
;
1053 indexes
[Normalizer2Impl::IX_EXTRA_DATA_OFFSET
]=offset
;
1054 int32_t totalSize
=offset
+=extraData
.length()*2;
1055 for(int32_t i
=Normalizer2Impl::IX_RESERVED2_OFFSET
; i
<=Normalizer2Impl::IX_TOTAL_SIZE
; ++i
) {
1056 indexes
[i
]=totalSize
;
1060 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength
);
1061 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData
.length());
1062 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize
);
1063 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]);
1064 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]);
1065 printf("minYesNo: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_YES_NO
]);
1066 printf("minNoNo: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO
]);
1067 printf("limitNoNo: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]);
1068 printf("minMaybeYes: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]);
1071 memcpy(dataInfo
.dataVersion
, unicodeVersion
, 4);
1072 UNewDataMemory
*pData
=
1073 udata_create(NULL
, NULL
, filename
, &dataInfo
,
1074 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, errorCode
);
1075 if(errorCode
.isFailure()) {
1076 fprintf(stderr
, "gennorm2 error: unable to create the output file %s - %s\n",
1077 filename
, errorCode
.errorName());
1078 exit(errorCode
.reset());
1080 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
1081 udata_writeBlock(pData
, norm16TrieBytes
.getAlias(), norm16TrieLength
);
1082 udata_writeUString(pData
, extraData
.getBuffer(), extraData
.length());
1084 int32_t writtenSize
=udata_finish(pData
, errorCode
);
1085 if(errorCode
.isFailure()) {
1086 fprintf(stderr
, "gennorm2: error %s writing the output file\n", errorCode
.errorName());
1087 exit(errorCode
.reset());
1089 if(writtenSize
!=totalSize
) {
1090 fprintf(stderr
, "gennorm2 error: written size %ld != calculated size %ld\n",
1091 (long)writtenSize
, (long)totalSize
);
1092 exit(U_INTERNAL_PROGRAM_ERROR
);
1098 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1101 * Hey, Emacs, please set the following:
1104 * indent-tabs-mode: nil