/*
*******************************************************************************
*   Copyright (C) 2009-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  n2builder.cpp
*   tab size:   8 (not used)
*   created on: 2009nov25
*   created by: Markus W. Scherer
*
* Builds Normalizer2 data and writes a binary .nrm file.
* For the file format see source/common/normalizer2impl.h.
*/
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
29 #include "unicode/errorcode.h"
30 #include "unicode/localpointer.h"
31 #include "unicode/putil.h"
32 #include "unicode/udata.h"
33 #include "unicode/uniset.h"
34 #include "unicode/unistr.h"
35 #include "unicode/ustring.h"
38 #include "normalizer2impl.h"
45 #if !UCONFIG_NO_NORMALIZATION
47 /* UDataInfo cf. udata.h */
48 static UDataInfo dataInfo
={
57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
58 { 2, 0, 0, 0 }, /* formatVersion */
59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */
64 class HangulIterator
{
71 HangulIterator() : rangeIndex(0) {}
72 const Range
*nextRange() {
73 if(rangeIndex
<UPRV_LENGTHOF(ranges
)) {
74 return ranges
+rangeIndex
++;
79 void reset() { rangeIndex
=0; }
81 static const Range ranges
[4];
85 const HangulIterator::Range
HangulIterator::ranges
[4]={
86 { Hangul::JAMO_L_BASE
, Hangul::JAMO_L_BASE
+Hangul::JAMO_L_COUNT
, 1 },
87 { Hangul::JAMO_V_BASE
, Hangul::JAMO_V_BASE
+Hangul::JAMO_V_COUNT
, Normalizer2Impl::JAMO_VT
},
88 // JAMO_T_BASE+1: not U+11A7
89 { Hangul::JAMO_T_BASE
+1, Hangul::JAMO_T_BASE
+Hangul::JAMO_T_COUNT
, Normalizer2Impl::JAMO_VT
},
90 { Hangul::HANGUL_BASE
, Hangul::HANGUL_BASE
+Hangul::HANGUL_COUNT
, 0 }, // will become minYesNo
93 struct CompositionPair
{
94 CompositionPair(UChar32 t
, UChar32 c
) : trail(t
), composite(c
) {}
95 UChar32 trail
, composite
;
99 enum MappingType
{ NONE
, REMOVED
, ROUND_TRIP
, ONE_WAY
};
101 UBool
hasMapping() const { return mappingType
>REMOVED
; }
103 // Requires hasMapping() and well-formed mapping.
104 void setMappingCP() {
106 if(!mapping
->isEmpty() && mapping
->length()==U16_LENGTH(c
=mapping
->char32At(0))) {
109 mappingCP
=U_SENTINEL
;
113 const CompositionPair
*getCompositionPairs(int32_t &length
) const {
114 if(compositions
==NULL
) {
118 length
=compositions
->size()/2;
119 return reinterpret_cast<const CompositionPair
*>(compositions
->getBuffer());
123 UnicodeString
*mapping
;
124 UnicodeString
*rawMapping
; // non-NULL if the mapping is further decomposed
125 UChar32 mappingCP
; // >=0 if mapping to 1 code point
126 int32_t mappingPhase
;
127 MappingType mappingType
;
129 UVector32
*compositions
; // (trail, composite) pairs
132 UBool hasNoCompBoundaryAfter
;
136 // Composition for back-combining character. Allowed, but not normally used.
138 // Composition for a starter that does not have a decomposition mapping.
140 // Round-trip mapping & composition for a starter.
141 OFFSET_YES_NO_MAPPING_AND_COMPOSITION
,
142 // Round-trip mapping for a starter that itself does not combine-forward.
143 OFFSET_YES_NO_MAPPING_ONLY
,
146 // Delta for an algorithmic one-way mapping.
149 enum { OFFSET_SHIFT
=4, OFFSET_MASK
=(1<<OFFSET_SHIFT
)-1 };
153 class Normalizer2DBEnumerator
{
155 Normalizer2DBEnumerator(Normalizer2DataBuilder
&b
) : builder(b
) {}
156 virtual ~Normalizer2DBEnumerator() {}
157 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) = 0;
158 Normalizer2DBEnumerator
*ptr() { return this; }
160 Normalizer2DataBuilder
&builder
;
165 static UBool U_CALLCONV
166 enumRangeHandler(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
167 return ((Normalizer2DBEnumerator
*)context
)->rangeHandler(start
, end
, value
);
172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode
&errorCode
) :
173 phase(0), overrideHandling(OVERRIDE_PREVIOUS
), optimization(OPTIMIZE_NORMAL
),
174 norm16TrieLength(0) {
175 memset(unicodeVersion
, 0, sizeof(unicodeVersion
));
176 normTrie
=utrie2_open(0, 0, &errorCode
);
177 normMem
=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm
));
178 norms
=allocNorm(); // unused Norm struct at index 0
179 memset(indexes
, 0, sizeof(indexes
));
180 memset(smallFCD
, 0, sizeof(smallFCD
));
183 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
184 utrie2_close(normTrie
);
185 int32_t normsLength
=utm_countItems(normMem
);
186 for(int32_t i
=1; i
<normsLength
; ++i
) {
187 delete norms
[i
].mapping
;
188 delete norms
[i
].rawMapping
;
189 delete norms
[i
].compositions
;
192 utrie2_close(norm16Trie
);
196 Normalizer2DataBuilder::setUnicodeVersion(const char *v
) {
197 UVersionInfo nullVersion
={ 0, 0, 0, 0 };
198 UVersionInfo version
;
199 u_versionFromString(version
, v
);
200 if( 0!=memcmp(version
, unicodeVersion
, U_MAX_VERSION_LENGTH
) &&
201 0!=memcmp(nullVersion
, unicodeVersion
, U_MAX_VERSION_LENGTH
)
203 char buffer
[U_MAX_VERSION_STRING_LENGTH
];
204 u_versionToString(unicodeVersion
, buffer
);
205 fprintf(stderr
, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
207 exit(U_ILLEGAL_ARGUMENT_ERROR
);
209 memcpy(unicodeVersion
, version
, U_MAX_VERSION_LENGTH
);
212 Norm
*Normalizer2DataBuilder::allocNorm() {
213 Norm
*p
=(Norm
*)utm_alloc(normMem
);
214 norms
=(Norm
*)utm_getStart(normMem
); // in case it got reallocated
218 /* get an existing Norm unit */
219 Norm
*Normalizer2DataBuilder::getNorm(UChar32 c
) {
220 uint32_t i
=utrie2_get32(normTrie
, c
);
227 const Norm
&Normalizer2DataBuilder::getNormRef(UChar32 c
) const {
228 return norms
[utrie2_get32(normTrie
, c
)];
232 * get or create a Norm unit;
233 * get or create the intermediate trie entries for it as well
235 Norm
*Normalizer2DataBuilder::createNorm(UChar32 c
) {
236 uint32_t i
=utrie2_get32(normTrie
, c
);
242 IcuToolErrorCode
errorCode("gennorm2/createNorm()");
243 utrie2_set32(normTrie
, c
, (uint32_t)(p
-norms
), errorCode
);
248 Norm
*Normalizer2DataBuilder::checkNormForMapping(Norm
*p
, UChar32 c
) {
250 if(p
->mappingType
!=Norm::NONE
) {
251 if( overrideHandling
==OVERRIDE_NONE
||
252 (overrideHandling
==OVERRIDE_PREVIOUS
&& p
->mappingPhase
==phase
)
255 "error in gennorm2 phase %d: "
256 "not permitted to override mapping for U+%04lX from phase %d\n",
257 (int)phase
, (long)c
, (int)p
->mappingPhase
);
258 exit(U_INVALID_FORMAT_ERROR
);
263 p
->mappingPhase
=phase
;
268 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh
) {
273 void Normalizer2DataBuilder::setCC(UChar32 c
, uint8_t cc
) {
274 createNorm(c
)->cc
=cc
;
277 uint8_t Normalizer2DataBuilder::getCC(UChar32 c
) const {
278 return getNormRef(c
).cc
;
281 static UBool
isWellFormed(const UnicodeString
&s
) {
282 UErrorCode errorCode
=U_ZERO_ERROR
;
283 u_strToUTF8(NULL
, 0, NULL
, s
.getBuffer(), s
.length(), &errorCode
);
284 return U_SUCCESS(errorCode
) || errorCode
==U_BUFFER_OVERFLOW_ERROR
;
287 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c
, const UnicodeString
&m
) {
288 if(!isWellFormed(m
)) {
290 "error in gennorm2 phase %d: "
291 "illegal one-way mapping from U+%04lX to malformed string\n",
292 (int)phase
, (long)c
);
293 exit(U_INVALID_FORMAT_ERROR
);
295 Norm
*p
=checkNormForMapping(createNorm(c
), c
);
296 p
->mapping
=new UnicodeString(m
);
297 p
->mappingType
=Norm::ONE_WAY
;
301 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c
, const UnicodeString
&m
) {
302 if(U_IS_SURROGATE(c
)) {
304 "error in gennorm2 phase %d: "
305 "illegal round-trip mapping from surrogate code point U+%04lX\n",
306 (int)phase
, (long)c
);
307 exit(U_INVALID_FORMAT_ERROR
);
309 if(!isWellFormed(m
)) {
311 "error in gennorm2 phase %d: "
312 "illegal round-trip mapping from U+%04lX to malformed string\n",
313 (int)phase
, (long)c
);
314 exit(U_INVALID_FORMAT_ERROR
);
316 int32_t numCP
=u_countChar32(m
.getBuffer(), m
.length());
319 "error in gennorm2 phase %d: "
320 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
321 (int)phase
, (long)c
, (int)numCP
);
322 exit(U_INVALID_FORMAT_ERROR
);
324 Norm
*p
=checkNormForMapping(createNorm(c
), c
);
325 p
->mapping
=new UnicodeString(m
);
326 p
->mappingType
=Norm::ROUND_TRIP
;
327 p
->mappingCP
=U_SENTINEL
;
330 void Normalizer2DataBuilder::removeMapping(UChar32 c
) {
331 Norm
*p
=checkNormForMapping(getNorm(c
), c
);
333 p
->mappingType
=Norm::REMOVED
;
337 class CompositionBuilder
: public Normalizer2DBEnumerator
{
339 CompositionBuilder(Normalizer2DataBuilder
&b
) : Normalizer2DBEnumerator(b
) {}
340 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
341 builder
.addComposition(start
, end
, value
);
347 Normalizer2DataBuilder::addComposition(UChar32 start
, UChar32 end
, uint32_t value
) {
348 if(norms
[value
].mappingType
==Norm::ROUND_TRIP
) {
351 "gennorm2 error: same round-trip mapping for "
352 "more than 1 code point U+%04lX..U+%04lX\n",
353 (long)start
, (long)end
);
354 exit(U_INVALID_FORMAT_ERROR
);
356 if(norms
[value
].cc
!=0) {
359 "U+%04lX has a round-trip mapping and ccc!=0, "
360 "not possible in Unicode normalization\n",
362 exit(U_INVALID_FORMAT_ERROR
);
364 // setRoundTripMapping() ensured that there are exactly two code points.
365 const UnicodeString
&m
=*norms
[value
].mapping
;
366 UChar32 lead
=m
.char32At(0);
367 UChar32 trail
=m
.char32At(m
.length()-1);
371 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
372 "not possible in Unicode normalization\n",
373 (long)start
, (long)lead
);
374 exit(U_INVALID_FORMAT_ERROR
);
376 // Flag for trailing character.
377 createNorm(trail
)->combinesBack
=TRUE
;
378 // Insert (trail, composite) pair into compositions list for the lead character.
379 IcuToolErrorCode
errorCode("gennorm2/addComposition()");
380 Norm
*leadNorm
=createNorm(lead
);
381 UVector32
*compositions
=leadNorm
->compositions
;
383 if(compositions
==NULL
) {
384 compositions
=leadNorm
->compositions
=new UVector32(errorCode
);
385 i
=0; // "insert" the first pair at index 0
387 // Insertion sort, and check for duplicate trail characters.
389 const CompositionPair
*pairs
=leadNorm
->getCompositionPairs(length
);
390 for(i
=0; i
<length
; ++i
) {
391 if(trail
==pairs
[i
].trail
) {
393 "gennorm2 error: same round-trip mapping for "
394 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
395 (long)start
, (long)lead
, (long)trail
);
396 exit(U_INVALID_FORMAT_ERROR
);
398 if(trail
<pairs
[i
].trail
) {
403 compositions
->insertElementAt(trail
, 2*i
, errorCode
);
404 compositions
->insertElementAt(start
, 2*i
+1, errorCode
);
408 UBool
Normalizer2DataBuilder::combinesWithCCBetween(const Norm
&norm
,
409 uint8_t lowCC
, uint8_t highCC
) const {
410 if((highCC
-lowCC
)>=2) {
412 const CompositionPair
*pairs
=norm
.getCompositionPairs(length
);
413 for(int32_t i
=0; i
<length
; ++i
) {
414 uint8_t trailCC
=getCC(pairs
[i
].trail
);
415 if(lowCC
<trailCC
&& trailCC
<highCC
) {
423 UChar32
Normalizer2DataBuilder::combine(const Norm
&norm
, UChar32 trail
) const {
425 const CompositionPair
*pairs
=norm
.getCompositionPairs(length
);
426 for(int32_t i
=0; i
<length
; ++i
) {
427 if(trail
==pairs
[i
].trail
) {
428 return pairs
[i
].composite
;
430 if(trail
<pairs
[i
].trail
) {
437 class Decomposer
: public Normalizer2DBEnumerator
{
439 Decomposer(Normalizer2DataBuilder
&b
) : Normalizer2DBEnumerator(b
), didDecompose(FALSE
) {}
440 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
441 didDecompose
|=builder
.decompose(start
, end
, value
);
448 Normalizer2DataBuilder::decompose(UChar32 start
, UChar32 end
, uint32_t value
) {
449 if(norms
[value
].hasMapping()) {
450 Norm
&norm
=norms
[value
];
451 const UnicodeString
&m
=*norm
.mapping
;
452 UnicodeString
*decomposed
=NULL
;
453 const UChar
*s
=m
.getBuffer();
454 int32_t length
=m
.length();
459 U16_NEXT(s
, i
, length
, c
);
460 if(start
<=c
&& c
<=end
) {
462 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
464 exit(U_INVALID_FORMAT_ERROR
);
466 const Norm
&cNorm
=getNormRef(c
);
467 if(cNorm
.hasMapping()) {
468 if(norm
.mappingType
==Norm::ROUND_TRIP
) {
470 if(cNorm
.mappingType
!=Norm::ROUND_TRIP
) {
473 "U+%04lX's round-trip mapping's starter "
474 "U+%04lX one-way-decomposes, "
475 "not possible in Unicode normalization\n",
476 (long)start
, (long)c
);
477 exit(U_INVALID_FORMAT_ERROR
);
479 uint8_t myTrailCC
=getCC(m
.char32At(i
));
480 UChar32 cTrailChar
=cNorm
.mapping
->char32At(cNorm
.mapping
->length()-1);
481 uint8_t cTrailCC
=getCC(cTrailChar
);
482 if(cTrailCC
>myTrailCC
) {
485 "U+%04lX's round-trip mapping's starter "
486 "U+%04lX decomposes and the "
487 "inner/earlier tccc=%hu > outer/following tccc=%hu, "
488 "not possible in Unicode normalization\n",
489 (long)start
, (long)c
,
490 (short)cTrailCC
, (short)myTrailCC
);
491 exit(U_INVALID_FORMAT_ERROR
);
496 "U+%04lX's round-trip mapping's non-starter "
497 "U+%04lX decomposes, "
498 "not possible in Unicode normalization\n",
499 (long)start
, (long)c
);
500 exit(U_INVALID_FORMAT_ERROR
);
503 if(decomposed
==NULL
) {
504 decomposed
=new UnicodeString(m
, 0, prev
);
506 decomposed
->append(*cNorm
.mapping
);
507 } else if(Hangul::isHangul(c
)) {
509 int32_t hangulLength
=Hangul::decompose(c
, buffer
);
510 if(norm
.mappingType
==Norm::ROUND_TRIP
&& prev
!=0) {
513 "U+%04lX's round-trip mapping's non-starter "
514 "U+%04lX decomposes, "
515 "not possible in Unicode normalization\n",
516 (long)start
, (long)c
);
517 exit(U_INVALID_FORMAT_ERROR
);
519 if(decomposed
==NULL
) {
520 decomposed
=new UnicodeString(m
, 0, prev
);
522 decomposed
->append(buffer
, hangulLength
);
523 } else if(decomposed
!=NULL
) {
524 decomposed
->append(m
, prev
, i
-prev
);
527 if(decomposed
!=NULL
) {
528 if(norm
.rawMapping
==NULL
) {
529 // Remember the original mapping when decomposing recursively.
530 norm
.rawMapping
=norm
.mapping
;
534 norm
.mapping
=decomposed
;
535 // Not norm.setMappingCP(); because the original mapping
536 // is most likely to be encodable as a delta.
543 class BuilderReorderingBuffer
{
545 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE
) {}
548 fLastStarterIndex
=-1;
551 int32_t length() const { return fLength
; }
552 UBool
isEmpty() const { return fLength
==0; }
553 int32_t lastStarterIndex() const { return fLastStarterIndex
; }
554 UChar32
charAt(int32_t i
) const { return fArray
[i
]>>8; }
555 uint8_t ccAt(int32_t i
) const { return (uint8_t)fArray
[i
]; }
556 UBool
didReorder() const { return fDidReorder
; }
557 void append(UChar32 c
, uint8_t cc
) {
558 if(cc
==0 || fLength
==0 || ccAt(fLength
-1)<=cc
) {
560 fLastStarterIndex
=fLength
;
562 fArray
[fLength
++]=(c
<<8)|cc
;
565 // Let this character bubble back to its canonical order.
567 while(i
>fLastStarterIndex
&& ccAt(i
)>cc
) {
570 ++i
; // after the last starter or prevCC<=cc
571 // Move this and the following characters forward one to make space.
572 for(int32_t j
=fLength
; i
<j
; --j
) {
573 fArray
[j
]=fArray
[j
-1];
579 void toString(UnicodeString
&dest
) {
581 for(int32_t i
=0; i
<fLength
; ++i
) {
582 dest
.append(charAt(i
));
585 void setComposite(UChar32 composite
, int32_t combMarkIndex
) {
586 fArray
[fLastStarterIndex
]=composite
<<8;
587 // Remove the combining mark that contributed to the composite.
589 while(combMarkIndex
<fLength
) {
590 fArray
[combMarkIndex
]=fArray
[combMarkIndex
+1];
595 int32_t fArray
[Normalizer2Impl::MAPPING_LENGTH_MASK
];
597 int32_t fLastStarterIndex
;
602 Normalizer2DataBuilder::reorder(Norm
*p
, BuilderReorderingBuffer
&buffer
) {
603 UnicodeString
&m
=*p
->mapping
;
604 int32_t length
=m
.length();
605 if(length
>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
606 return; // writeMapping() will complain about it and print the code point.
608 const UChar
*s
=m
.getBuffer();
612 U16_NEXT(s
, i
, length
, c
);
613 buffer
.append(c
, getCC(c
));
615 if(buffer
.didReorder()) {
621 * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
622 * A starter character with a mapping does not have a composition boundary after it
623 * if the character itself combines-forward (which is tested by the caller of this function),
624 * or it is deleted (mapped to the empty string),
625 * or its mapping contains no starter,
626 * or the last starter combines-forward.
628 UBool
Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer
&buffer
) {
629 if(buffer
.isEmpty()) {
630 return TRUE
; // maps-to-empty-string is no boundary of any kind
632 int32_t lastStarterIndex
=buffer
.lastStarterIndex();
633 if(lastStarterIndex
<0) {
634 return TRUE
; // no starter
636 UChar32 starter
=buffer
.charAt(lastStarterIndex
);
637 if( Hangul::isJamoL(starter
) ||
638 (Hangul::isJamoV(starter
) &&
639 0<lastStarterIndex
&& Hangul::isJamoL(buffer
.charAt(lastStarterIndex
-1)))
641 // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
642 // otherwise it is blocked.
643 return lastStarterIndex
==buffer
.length()-1;
645 // Note: There can be no Hangul syllable in the fully decomposed mapping.
646 const Norm
*starterNorm
=&getNormRef(starter
);
647 if(starterNorm
->compositions
==NULL
) {
648 return FALSE
; // the last starter does not combine forward
650 // Compose as far as possible, and see if further compositions are possible.
652 for(int32_t combMarkIndex
=lastStarterIndex
+1; combMarkIndex
<buffer
.length();) {
653 uint8_t cc
=buffer
.ccAt(combMarkIndex
); // !=0 because after last starter
654 if(combinesWithCCBetween(*starterNorm
, prevCC
, cc
)) {
658 (starter
=combine(*starterNorm
, buffer
.charAt(combMarkIndex
)))>=0
660 buffer
.setComposite(starter
, combMarkIndex
);
661 starterNorm
=&getNormRef(starter
);
662 if(starterNorm
->compositions
==NULL
) {
663 return FALSE
; // the composite does not combine further
670 // TRUE if the final, forward-combining starter is at the end.
674 // Requires p->hasMapping().
675 // Returns the offset of the "first unit" from the beginning of the extraData for c.
676 // That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
677 int32_t Normalizer2DataBuilder::writeMapping(UChar32 c
, const Norm
*p
, UnicodeString
&dataString
) {
678 UnicodeString
&m
=*p
->mapping
;
679 int32_t length
=m
.length();
680 if(length
>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
683 "mapping for U+%04lX longer than maximum of %d\n",
684 (long)c
, Normalizer2Impl::MAPPING_LENGTH_MASK
);
685 exit(U_INVALID_FORMAT_ERROR
);
687 int32_t leadCC
, trailCC
;
691 leadCC
=getCC(m
.char32At(0));
692 trailCC
=getCC(m
.char32At(length
-1));
694 if(c
<Normalizer2Impl::MIN_CCC_LCCC_CP
&& (p
->cc
!=0 || leadCC
!=0)) {
697 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
699 exit(U_INVALID_FORMAT_ERROR
);
701 // Write small-FCD data.
702 if((leadCC
|trailCC
)!=0) {
703 UChar32 lead
= c
<=0xffff ? c
: U16_LEAD(c
);
704 smallFCD
[lead
>>8]|=(uint8_t)1<<((lead
>>5)&7);
706 // Write the mapping & raw mapping extraData.
707 int32_t firstUnit
=length
|(trailCC
<<8);
708 int32_t preMappingLength
=0;
709 if(p
->rawMapping
!=NULL
) {
710 UnicodeString
&rm
=*p
->rawMapping
;
711 int32_t rmLength
=rm
.length();
712 if(rmLength
>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
715 "raw mapping for U+%04lX longer than maximum of %d\n",
716 (long)c
, Normalizer2Impl::MAPPING_LENGTH_MASK
);
717 exit(U_INVALID_FORMAT_ERROR
);
719 UChar rm0
=rm
.charAt(0);
720 if( rmLength
==length
-1 &&
721 // 99: overlong substring lengths get pinned to remainder lengths anyway
722 0==rm
.compare(1, 99, m
, 2, 99) &&
723 rm0
>Normalizer2Impl::MAPPING_LENGTH_MASK
726 // rawMapping=rm0+mapping.substring(2) -> store only rm0
728 // The raw mapping is the same as the final mapping after replacing
729 // the final mapping's first two code units with the raw mapping's first one.
730 // In this case, we store only that first unit, rm0.
731 // This helps with a few hundred mappings.
732 dataString
.append(rm0
);
735 // Store the raw mapping with its length.
736 dataString
.append(rm
);
737 dataString
.append((UChar
)rmLength
);
738 preMappingLength
=rmLength
+1;
740 firstUnit
|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING
;
742 int32_t cccLccc
=p
->cc
|(leadCC
<<8);
744 dataString
.append((UChar
)cccLccc
);
746 firstUnit
|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD
;
748 if(p
->hasNoCompBoundaryAfter
) {
749 firstUnit
|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER
;
751 dataString
.append((UChar
)firstUnit
);
752 dataString
.append(m
);
753 return preMappingLength
;
756 // Requires p->compositions!=NULL.
757 void Normalizer2DataBuilder::writeCompositions(UChar32 c
, const Norm
*p
, UnicodeString
&dataString
) {
761 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
763 exit(U_INVALID_FORMAT_ERROR
);
766 const CompositionPair
*pairs
=p
->getCompositionPairs(length
);
767 for(int32_t i
=0; i
<length
; ++i
) {
768 const CompositionPair
&pair
=pairs
[i
];
769 // 22 bits for the composite character and whether it combines forward.
770 UChar32 compositeAndFwd
=pair
.composite
<<1;
771 if(getNormRef(pair
.composite
).compositions
!=NULL
) {
772 compositeAndFwd
|=1; // The composite character also combines-forward.
774 // Encode most pairs in two units and some in three.
775 int32_t firstUnit
, secondUnit
, thirdUnit
;
776 if(pair
.trail
<Normalizer2Impl::COMP_1_TRAIL_LIMIT
) {
777 if(compositeAndFwd
<=0xffff) {
778 firstUnit
=pair
.trail
<<1;
779 secondUnit
=compositeAndFwd
;
782 firstUnit
=(pair
.trail
<<1)|Normalizer2Impl::COMP_1_TRIPLE
;
783 secondUnit
=compositeAndFwd
>>16;
784 thirdUnit
=compositeAndFwd
;
787 firstUnit
=(Normalizer2Impl::COMP_1_TRAIL_LIMIT
+
788 (pair
.trail
>>Normalizer2Impl::COMP_1_TRAIL_SHIFT
))|
789 Normalizer2Impl::COMP_1_TRIPLE
;
790 secondUnit
=(pair
.trail
<<Normalizer2Impl::COMP_2_TRAIL_SHIFT
)|
791 (compositeAndFwd
>>16);
792 thirdUnit
=compositeAndFwd
;
794 // Set the high bit of the first unit if this is the last composition pair.
796 firstUnit
|=Normalizer2Impl::COMP_1_LAST_TUPLE
;
798 dataString
.append((UChar
)firstUnit
).append((UChar
)secondUnit
);
800 dataString
.append((UChar
)thirdUnit
);
805 class ExtraDataWriter
: public Normalizer2DBEnumerator
{
807 ExtraDataWriter(Normalizer2DataBuilder
&b
) :
808 Normalizer2DBEnumerator(b
),
809 yesYesCompositions(1000, (UChar32
)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
810 yesNoMappingsAndCompositions(1000, (UChar32
)0, 1) {} // 0=Hangul, 1=start of normal data
811 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
815 "gennorm2 error: unexpected shared data for "
816 "multiple code points U+%04lX..U+%04lX\n",
817 (long)start
, (long)end
);
818 exit(U_INTERNAL_PROGRAM_ERROR
);
820 builder
.writeExtraData(start
, value
, *this);
824 UnicodeString maybeYesCompositions
;
825 UnicodeString yesYesCompositions
;
826 UnicodeString yesNoMappingsAndCompositions
;
827 UnicodeString yesNoMappingsOnly
;
828 UnicodeString noNoMappings
;
829 Hashtable previousNoNoMappings
; // If constructed in runtime code, pass in UErrorCode.
832 void Normalizer2DataBuilder::writeExtraData(UChar32 c
, uint32_t value
, ExtraDataWriter
&writer
) {
834 if(!p
->hasMapping()) {
835 // Write small-FCD data.
836 // There is similar code in writeMapping() for characters that do have a mapping.
837 if(c
<Normalizer2Impl::MIN_CCC_LCCC_CP
&& p
->cc
!=0) {
840 "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
842 exit(U_INVALID_FORMAT_ERROR
);
845 UChar32 lead
= c
<=0xffff ? c
: U16_LEAD(c
);
846 smallFCD
[lead
>>8]|=(uint8_t)1<<((lead
>>5)&7);
849 if(p
->combinesBack
) {
850 if(p
->hasMapping()) {
853 "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
855 exit(U_INVALID_FORMAT_ERROR
);
857 if(p
->compositions
!=NULL
) {
859 (writer
.maybeYesCompositions
.length()<<Norm::OFFSET_SHIFT
)|
860 Norm::OFFSET_MAYBE_YES
;
861 writeCompositions(c
, p
, writer
.maybeYesCompositions
);
863 } else if(!p
->hasMapping()) {
864 if(p
->compositions
!=NULL
) {
866 (writer
.yesYesCompositions
.length()<<Norm::OFFSET_SHIFT
)|
867 Norm::OFFSET_YES_YES
;
868 writeCompositions(c
, p
, writer
.yesYesCompositions
);
870 } else if(p
->mappingType
==Norm::ROUND_TRIP
) {
871 if(p
->compositions
!=NULL
) {
872 int32_t offset
=writer
.yesNoMappingsAndCompositions
.length()+
873 writeMapping(c
, p
, writer
.yesNoMappingsAndCompositions
);
874 p
->offset
=(offset
<<Norm::OFFSET_SHIFT
)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION
;
875 writeCompositions(c
, p
, writer
.yesNoMappingsAndCompositions
);
877 int32_t offset
=writer
.yesNoMappingsOnly
.length()+
878 writeMapping(c
, p
, writer
.yesNoMappingsOnly
);
879 p
->offset
=(offset
<<Norm::OFFSET_SHIFT
)|Norm::OFFSET_YES_NO_MAPPING_ONLY
;
881 } else /* one-way */ {
882 if(p
->compositions
!=NULL
) {
885 "U+%04lX combines-forward and has a one-way mapping, "
886 "not possible in Unicode normalization\n",
888 exit(U_INVALID_FORMAT_ERROR
);
890 if(p
->cc
==0 && optimization
!=OPTIMIZE_FAST
) {
891 // Try a compact, algorithmic encoding.
892 // Only for ccc=0, because we can't store additional information
893 // and we do not recursively follow an algorithmic encoding for access to the ccc.
895 // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
896 // if the mappingCP decomposes further, to ensure that there is a place to store it.
897 // We want to see that the final mapping does not have exactly 1 code point,
898 // or else we would have to recursively ensure that the final mapping is stored
899 // in normal extraData.
900 if(p
->mappingCP
>=0 && (!p
->hasNoCompBoundaryAfter
|| 1!=p
->mapping
->countChar32())) {
901 int32_t delta
=p
->mappingCP
-c
;
902 if(-Normalizer2Impl::MAX_DELTA
<=delta
&& delta
<=Normalizer2Impl::MAX_DELTA
) {
903 p
->offset
=(delta
<<Norm::OFFSET_SHIFT
)|Norm::OFFSET_DELTA
;
908 int32_t oldNoNoLength
=writer
.noNoMappings
.length();
909 int32_t offset
=oldNoNoLength
+writeMapping(c
, p
, writer
.noNoMappings
);
910 UnicodeString newMapping
=writer
.noNoMappings
.tempSubString(oldNoNoLength
);
911 int32_t previousOffset
=writer
.previousNoNoMappings
.geti(newMapping
);
912 if(previousOffset
!=0) {
913 // Duplicate, remove the new units and point to the old ones.
914 writer
.noNoMappings
.truncate(oldNoNoLength
);
915 p
->offset
=((previousOffset
-1)<<Norm::OFFSET_SHIFT
)|Norm::OFFSET_NO_NO
;
917 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
918 IcuToolErrorCode
errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
919 writer
.previousNoNoMappings
.puti(newMapping
, offset
+1, errorCode
);
920 p
->offset
=(offset
<<Norm::OFFSET_SHIFT
)|Norm::OFFSET_NO_NO
;
926 class Norm16Writer
: public Normalizer2DBEnumerator
{
928 Norm16Writer(Normalizer2DataBuilder
&b
) : Normalizer2DBEnumerator(b
) {}
929 virtual UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
930 builder
.writeNorm16(start
, end
, value
);
935 void Normalizer2DataBuilder::writeNorm16(UChar32 start
, UChar32 end
, uint32_t value
) {
937 const Norm
*p
=norms
+value
;
938 int32_t offset
=p
->offset
>>Norm::OFFSET_SHIFT
;
940 UBool isDecompNo
=FALSE
;
941 UBool isCompNoMaybe
=FALSE
;
942 switch(p
->offset
&Norm::OFFSET_MASK
) {
943 case Norm::OFFSET_NONE
:
944 // No mapping, no compositions list.
945 if(p
->combinesBack
) {
946 norm16
=Normalizer2Impl::MIN_NORMAL_MAYBE_YES
+p
->cc
;
947 isDecompNo
=(UBool
)(p
->cc
!=0);
949 } else if(p
->cc
!=0) {
950 norm16
=Normalizer2Impl::MIN_YES_YES_WITH_CC
-1+p
->cc
;
951 isDecompNo
=isCompNoMaybe
=TRUE
;
954 case Norm::OFFSET_MAYBE_YES
:
955 norm16
=indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]+offset
;
958 case Norm::OFFSET_YES_YES
:
961 case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION
:
962 norm16
=indexes
[Normalizer2Impl::IX_MIN_YES_NO
]+offset
;
965 case Norm::OFFSET_YES_NO_MAPPING_ONLY
:
966 norm16
=indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]+offset
;
969 case Norm::OFFSET_NO_NO
:
970 norm16
=indexes
[Normalizer2Impl::IX_MIN_NO_NO
]+offset
;
971 isDecompNo
=isCompNoMaybe
=TRUE
;
973 case Norm::OFFSET_DELTA
:
974 norm16
=getCenterNoNoDelta()+offset
;
975 isDecompNo
=isCompNoMaybe
=TRUE
;
977 default: // Should not occur.
978 exit(U_INTERNAL_PROGRAM_ERROR
);
980 IcuToolErrorCode
errorCode("gennorm2/writeNorm16()");
981 utrie2_setRange32(norm16Trie
, start
, end
, (uint32_t)norm16
, TRUE
, errorCode
);
982 if(isDecompNo
&& start
<indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]) {
983 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=start
;
985 if(isCompNoMaybe
&& start
<indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]) {
986 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=start
;
991 void Normalizer2DataBuilder::setHangulData() {
993 const HangulIterator::Range
*range
;
994 // Check that none of the Hangul/Jamo code points have data.
995 while((range
=hi
.nextRange())!=NULL
) {
996 for(UChar32 c
=range
->start
; c
<range
->limit
; ++c
) {
997 if(utrie2_get32(norm16Trie
, c
)!=0) {
1000 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
1002 exit(U_INVALID_FORMAT_ERROR
);
1006 // Set data for algorithmic runtime handling.
1007 IcuToolErrorCode
errorCode("gennorm2/setHangulData()");
1009 while((range
=hi
.nextRange())!=NULL
) {
1010 uint16_t norm16
=range
->norm16
;
1012 norm16
=(uint16_t)indexes
[Normalizer2Impl::IX_MIN_YES_NO
]; // Hangul LV/LVT encoded as minYesNo
1013 if(range
->start
<indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]) {
1014 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=range
->start
;
1017 if(range
->start
<indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]) { // Jamo V/T are maybeYes
1018 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=range
->start
;
1021 utrie2_setRange32(norm16Trie
, range
->start
, range
->limit
-1, norm16
, TRUE
, errorCode
);
1022 errorCode
.assertSuccess();
1028 static UBool U_CALLCONV
1029 enumRangeMaxValue(const void *context
, UChar32
/*start*/, UChar32
/*end*/, uint32_t value
) {
1030 uint32_t *pMaxValue
=(uint32_t *)context
;
1031 if(value
>*pMaxValue
) {
/*
 * Post-processes the collected mappings into the serialized data pieces:
 * runs the CompositionBuilder, Decomposer, ExtraDataWriter and Norm16Writer
 * passes over normTrie, sets hasNoCompBoundaryAfter on the Norm entries,
 * concatenates extraData, fills in the indexes[] thresholds and byte offsets,
 * adjusts the lead surrogate values in norm16Trie, freezes norm16Trie and
 * measures its serialized length, and copies unicodeVersion into dataInfo.
 * Takes no arguments and returns nothing; the visible error paths print a
 * message and exit() the process.
 */
1039 void Normalizer2DataBuilder::processData() {
1040 IcuToolErrorCode
errorCode("gennorm2/processData()");
// Open the norm16 trie that the rest of this function works on.
1041 norm16Trie
=utrie2_open(0, 0, errorCode
);
1042 errorCode
.assertSuccess();
// First pass over normTrie, with a temporary CompositionBuilder as the enumeration context.
1044 utrie2_enum(normTrie
, NULL
, enumRangeHandler
, CompositionBuilder(*this).ptr());
// Decomposer passes: normTrie is enumerated again for as long as a pass
// leaves decomposer.didDecompose set.
1046 Decomposer
decomposer(*this);
1048 decomposer
.didDecompose
=FALSE
;
1049 utrie2_enum(normTrie
, NULL
, enumRangeHandler
, &decomposer
);
1050 } while(decomposer
.didDecompose
);
// Walk the Norm entries starting at index 1 (index 0 is not visited;
// presumably it is a placeholder entry -- confirm against the norms allocation).
1052 BuilderReorderingBuffer buffer
;
1053 int32_t normsLength
=utm_countItems(normMem
);
1054 for(int32_t i
=1; i
<normsLength
; ++i
) {
1055 // Set the hasNoCompBoundaryAfter flag for use by the last code branch
1056 // in Normalizer2Impl::hasCompBoundaryAfter().
1057 // For details see the comments on hasNoCompBoundaryAfter(buffer).
1058 const Norm
&norm
=norms
[i
];
1059 if(norm
.hasMapping()) {
// An entry that has a mapping and also its own compositions gets the flag set directly.
1060 if(norm
.compositions
!=NULL
) {
1061 norms
[i
].hasNoCompBoundaryAfter
=TRUE
;
// Run reorder() on this entry with the shared buffer and derive the flag from the buffer.
1064 reorder(norms
+i
, buffer
);
1065 norms
[i
].hasNoCompBoundaryAfter
=hasNoCompBoundaryAfter(buffer
);
// Start both minimum code points at 0x110000, one past the last Unicode code point
// (the value for an empty data file, see the comment on minCP further below).
1070 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=0x110000;
1071 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=0x110000;
// ExtraDataWriter pass over normTrie; its five string members are concatenated below.
1073 ExtraDataWriter
extraDataWriter(*this);
1074 utrie2_enum(normTrie
, NULL
, enumRangeHandler
, &extraDataWriter
);
// extraData layout: maybeYesCompositions, yesYesCompositions,
// yesNoMappingsAndCompositions, yesNoMappingsOnly, noNoMappings.
1076 extraData
=extraDataWriter
.maybeYesCompositions
;
1077 extraData
.append(extraDataWriter
.yesYesCompositions
).
1078 append(extraDataWriter
.yesNoMappingsAndCompositions
).
1079 append(extraDataWriter
.yesNoMappingsOnly
).
1080 append(extraDataWriter
.noNoMappings
);
1081 // Pad to even length for 4-byte alignment of following data.
1082 if(extraData
.length()&1) {
1083 extraData
.append((UChar
)0);
// The thresholds below are running sums of the segment lengths that follow
// maybeYesCompositions in extraData: yesYes, then yesNo with compositions,
// then yesNo mappings-only, then noNo.
1086 indexes
[Normalizer2Impl::IX_MIN_YES_NO
]=
1087 extraDataWriter
.yesYesCompositions
.length();
1088 indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]=
1089 indexes
[Normalizer2Impl::IX_MIN_YES_NO
]+
1090 extraDataWriter
.yesNoMappingsAndCompositions
.length();
1091 indexes
[Normalizer2Impl::IX_MIN_NO_NO
]=
1092 indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]+
1093 extraDataWriter
.yesNoMappingsOnly
.length();
1094 indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]=
1095 indexes
[Normalizer2Impl::IX_MIN_NO_NO
]+
1096 extraDataWriter
.noNoMappings
.length();
// minMaybeYes is MIN_NORMAL_MAYBE_YES minus the length of the maybeYes compositions segment.
1097 indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]=
1098 Normalizer2Impl::MIN_NORMAL_MAYBE_YES
-
1099 extraDataWriter
.maybeYesCompositions
.length();
// Overflow check: limitNoNo must not exceed getCenterNoNoDelta()-MAX_DELTA;
// otherwise the tool reports the overflow and exits with U_BUFFER_OVERFLOW_ERROR.
1101 int32_t minNoNoDelta
=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA
;
1102 if(indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]>minNoNoDelta
) {
1105 "data structure overflow, too much mapping composition data\n");
1106 exit(U_BUFFER_OVERFLOW_ERROR
);
// Norm16Writer pass over normTrie, run after the thresholds above are set
// (presumably this is what stores the norm16 values into norm16Trie -- confirm in Norm16Writer).
1109 utrie2_enum(normTrie
, NULL
, enumRangeHandler
, Norm16Writer(*this).ptr());
1113 // Look for the "worst" norm16 value of any supplementary code point
1114 // corresponding to a lead surrogate, and set it as that surrogate's value.
1115 // Enables quick check inner loops to look at only code units.
1117 // We could be more sophisticated:
1118 // We could collect a bit set for whether there are values in the different
1119 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
1120 // and select the best value that only breaks the composition and/or decomposition
1121 // inner loops if necessary.
1122 // However, that seems like overkill for an optimization for supplementary characters.
1123 for(UChar lead
=0xd800; lead
<0xdc00; ++lead
) {
// maxValue starts as the value stored for the lead surrogate itself;
// enumRangeMaxValue receives &maxValue as its context during the enumeration.
1124 uint32_t maxValue
=utrie2_get32(norm16Trie
, lead
);
1125 utrie2_enumForLeadSurrogate(norm16Trie
, lead
, NULL
, enumRangeMaxValue
, &maxValue
);
1126 if( maxValue
>=(uint32_t)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
] &&
1127 maxValue
>(uint32_t)indexes
[Normalizer2Impl::IX_MIN_NO_NO
]
1129 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
1130 // Otherwise it might end up at something like JAMO_VT which stays in
1131 // the inner decomposition quick check loop.
1132 maxValue
=(uint32_t)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]-1;
1134 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie
, lead
, maxValue
, errorCode
);
1137 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
1138 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
1139 // which is harmless.
1140 // As a result, the minimum code points are always BMP code points.
1141 int32_t minCP
=indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
];
1142 if(minCP
>=0x10000) {
1143 indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]=U16_LEAD(minCP
);
1145 minCP
=indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
];
1146 if(minCP
>=0x10000) {
1147 indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]=U16_LEAD(minCP
);
// Freeze the trie with 16-bit values, then call utrie2_serialize() without a
// buffer to obtain the serialized length. U_BUFFER_OVERFLOW_ERROR is the
// expected result of that call; any other code is reported as a failure.
1150 utrie2_freeze(norm16Trie
, UTRIE2_16_VALUE_BITS
, errorCode
);
1151 norm16TrieLength
=utrie2_serialize(norm16Trie
, NULL
, 0, errorCode
);
1152 if(errorCode
.get()!=U_BUFFER_OVERFLOW_ERROR
) {
1153 fprintf(stderr
, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
1154 errorCode
.errorName());
1155 exit(errorCode
.reset());
// Byte offsets of the data pieces, in order: indexes[] itself, the trie,
// extraData (2 bytes per 16-bit unit), smallFCD.
1159 int32_t offset
=(int32_t)sizeof(indexes
);
1160 indexes
[Normalizer2Impl::IX_NORM_TRIE_OFFSET
]=offset
;
1161 offset
+=norm16TrieLength
;
1162 indexes
[Normalizer2Impl::IX_EXTRA_DATA_OFFSET
]=offset
;
1163 offset
+=extraData
.length()*2;
1164 indexes
[Normalizer2Impl::IX_SMALL_FCD_OFFSET
]=offset
;
1165 offset
+=sizeof(smallFCD
);
1166 int32_t totalSize
=offset
;
// Every index slot from IX_RESERVED3_OFFSET through IX_TOTAL_SIZE is set to the total size.
1167 for(int32_t i
=Normalizer2Impl::IX_RESERVED3_OFFSET
; i
<=Normalizer2Impl::IX_TOTAL_SIZE
; ++i
) {
1168 indexes
[i
]=totalSize
;
// Report the sizes, minimum code points and norm16 thresholds on stdout.
1172 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength
);
1173 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData
.length());
1174 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD
));
1175 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize
);
1176 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_DECOMP_NO_CP
]);
1177 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes
[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP
]);
1178 printf("minYesNo: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_YES_NO
]);
1179 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
]);
1180 printf("minNoNo: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_NO_NO
]);
1181 printf("limitNoNo: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_LIMIT_NO_NO
]);
1182 printf("minMaybeYes: 0x%04x\n", (int)indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
]);
// If unicodeVersion is still 0.0.0.0, fall back to U_UNICODE_VERSION;
// then copy the 4 version bytes into dataInfo.dataVersion.
1185 UVersionInfo nullVersion
={ 0, 0, 0, 0 };
1186 if(0==memcmp(nullVersion
, unicodeVersion
, 4)) {
1187 u_versionFromString(unicodeVersion
, U_UNICODE_VERSION
);
1189 memcpy(dataInfo
.dataVersion
, unicodeVersion
, 4);
/*
 * Writes the data to a binary file.
 * filename: passed through to udata_create() as the output file name.
 * The payload is written as indexes[], the serialized norm16Trie, extraData
 * and smallFCD, in that order. The function exits the process if the file
 * cannot be created, if finishing the file fails, or if the number of bytes
 * written differs from indexes[IX_TOTAL_SIZE].
 */
1192 void Normalizer2DataBuilder::writeBinaryFile(const char *filename
) {
1195 IcuToolErrorCode
errorCode("gennorm2/writeBinaryFile()");
// Serialize norm16Trie into a heap array of norm16TrieLength bytes held by a LocalArray.
1196 LocalArray
<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength
]);
1197 utrie2_serialize(norm16Trie
, norm16TrieBytes
.getAlias(), norm16TrieLength
, errorCode
);
1198 errorCode
.assertSuccess();
// Create the output with the dataInfo header; the copyright string is
// passed only when haveCopyright is set, otherwise NULL.
1200 UNewDataMemory
*pData
=
1201 udata_create(NULL
, NULL
, filename
, &dataInfo
,
1202 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, errorCode
);
1203 if(errorCode
.isFailure()) {
1204 fprintf(stderr
, "gennorm2 error: unable to create the output file %s - %s\n",
1205 filename
, errorCode
.errorName());
1206 exit(errorCode
.reset());
// Payload order: indexes[], serialized trie, extraData, smallFCD.
1208 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
1209 udata_writeBlock(pData
, norm16TrieBytes
.getAlias(), norm16TrieLength
);
1210 udata_writeUString(pData
, extraData
.getBuffer(), extraData
.length());
1211 udata_writeBlock(pData
, smallFCD
, sizeof(smallFCD
));
// The size returned by udata_finish() is checked against the stored total size below.
1212 int32_t writtenSize
=udata_finish(pData
, errorCode
);
1213 if(errorCode
.isFailure()) {
1214 fprintf(stderr
, "gennorm2: error %s writing the output file\n", errorCode
.errorName());
1215 exit(errorCode
.reset());
// Consistency check: a mismatch is reported and ends the tool with U_INTERNAL_PROGRAM_ERROR.
1217 int32_t totalSize
=indexes
[Normalizer2Impl::IX_TOTAL_SIZE
];
1218 if(writtenSize
!=totalSize
) {
1219 fprintf(stderr
, "gennorm2 error: written size %ld != calculated size %ld\n",
1220 (long)writtenSize
, (long)totalSize
);
1221 exit(U_INTERNAL_PROGRAM_ERROR
);
/*
 * Writes the data as C source text instead of a binary file.
 * filename: output path; its directory part and basename are separated below,
 * and the basename without its last '.' extension (dataName) is used as the
 * prefix of the generated identifiers (<dataName>_formatVersion,
 * <dataName>_indexes, <dataName>_trieIndex, ...).
 * The visible error path prints a message and exits with U_FILE_ACCESS_ERROR
 * when the output file cannot be created.
 */
1226 Normalizer2DataBuilder::writeCSourceFile(const char *filename
) {
1229 IcuToolErrorCode
errorCode("gennorm2/writeCSourceFile()");
// path is the part of filename before the basename; dataName starts as the
// whole basename and is truncated at the last '.' if there is one.
1230 const char *basename
=findBasename(filename
);
1231 CharString
path(filename
, (int32_t)(basename
-filename
), errorCode
);
1232 CharString
dataName(basename
, errorCode
);
1233 const char *extension
=strrchr(basename
, '.');
1234 if(extension
!=NULL
) {
1235 dataName
.truncate((int32_t)(extension
-basename
));
1237 errorCode
.assertSuccess();
// Serialize norm16Trie into a heap array of norm16TrieLength bytes held by a LocalArray.
1239 LocalArray
<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength
]);
1240 utrie2_serialize(norm16Trie
, norm16TrieBytes
.getAlias(), norm16TrieLength
, errorCode
);
1241 errorCode
.assertSuccess();
// Open the output via usrc_create() with the directory, the basename and a
// string naming this source file (presumably recorded as the generator -- confirm in usrc_create()).
1243 FILE *f
=usrc_create(path
.data(), basename
, "icu/source/tools/gennorm2/n2builder.cpp");
// Error path: report that the output file could not be created and exit.
1245 fprintf(stderr
, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
1247 exit(U_FILE_ACCESS_ERROR
);
// Each piece is emitted by formatting a declaration prefix into line and
// passing it to a usrc_write...() call together with the data.
// formatVersion and dataVersion: 4 elements each, written with width argument 8.
1251 sprintf(line
, "static const UVersionInfo %s_formatVersion={", dataName
.data());
1252 usrc_writeArray(f
, line
, dataInfo
.formatVersion
, 8, 4, "};\n");
1253 sprintf(line
, "static const UVersionInfo %s_dataVersion={", dataName
.data());
1254 usrc_writeArray(f
, line
, dataInfo
.dataVersion
, 8, 4, "};\n\n");
1255 sprintf(line
, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n",
1259 indexes
, 32, Normalizer2Impl::IX_COUNT
,
// The doubled %% leaves a literal %ld in line after this sprintf()
// (presumably filled in with the array length by the writer call -- confirm).
1261 sprintf(line
, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName
.data());
1262 usrc_writeUTrie2Arrays(f
,
1266 sprintf(line
, "static const uint16_t %s_extraData[%%ld]={\n", dataName
.data());
1269 extraData
.getBuffer(), 16, extraData
.length(),
1271 sprintf(line
, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName
.data());
1274 smallFCD
, 8, sizeof(smallFCD
),
1277 "static const UCaseProps %s_singleton={\n"
// The UTrie2 struct initializer refers to the <dataName>_trieIndex array by name (line2).
1283 sprintf(line
, "static const UTrie2 %s_trie={\n", dataName
.data());
1285 sprintf(line2
, "%s_trieIndex", dataName
.data());
1286 usrc_writeUTrie2Struct(f
,
1288 norm16Trie
, line2
, NULL
,
1295 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1298 * Hey, Emacs, please set the following:
1301 * indent-tabs-mode: nil