2 *******************************************************************************
4 * Copyright (C) 1999-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2001may25
14 * created by: Markus W. Scherer
16 * Store Unicode normalization data in a memory-mappable file.
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
27 #include "unicode/udata.h"
29 #include "unicode/uset.h"
35 # pragma warning(disable: 4100)
38 #define DO_DEBUG_OUT 0
40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
43 * The new implementation of the normalization code loads its data from
44 * unorm.icu, which is generated with this gennorm tool.
45 * The format of that file is described in unormimp.h .
48 /* file data ---------------------------------------------------------------- */
50 #if UCONFIG_NO_NORMALIZATION
52 /* dummy UDataInfo cf. udata.h */
53 static UDataInfo dataInfo
= {
62 { 0, 0, 0, 0 }, /* dummy dataFormat */
63 { 0, 0, 0, 0 }, /* dummy formatVersion */
64 { 0, 0, 0, 0 } /* dummy dataVersion */
69 /* UDataInfo cf. udata.h */
70 static UDataInfo dataInfo
={
79 { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
80 { 2, 3, UTRIE_SHIFT
, UTRIE_INDEX_SHIFT
}, /* formatVersion */
81 { 3, 2, 0, 0 } /* dataVersion (Unicode version) */
85 setUnicodeVersion(const char *v
) {
87 u_versionFromString(version
, v
);
88 uprv_memcpy(dataInfo
.dataVersion
, version
, 4);
91 static int32_t indexes
[_NORM_INDEX_TOP
]={ 0 };
93 /* builder data ------------------------------------------------------------- */
95 typedef void EnumTrieFn(void *context
, uint32_t code
, Norm
*norm
);
103 static UToolMemory
*normMem
, *utf32Mem
, *extraMem
, *combiningTriplesMem
;
108 * set a flag for each code point that was seen in decompositions -
109 * avoid decomposing ones that have not been used before
111 static uint32_t haveSeenFlags
[256];
113 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
114 static USet
*nfdQCNoSet
;
116 /* see addCombiningCP() for details */
117 static uint32_t combiningCPs
[2000];
120 * after processCombining() this contains for each code point in combiningCPs[]
121 * the runtime combining index
123 static uint16_t combiningIndexes
[2000];
125 /* section limits for combiningCPs[], see addCombiningCP() */
126 static uint16_t combineFwdTop
=0, combineBothTop
=0, combineBackTop
=0;
129 * Structure for a triple of code points, stored in combiningTriplesMem.
130 * The lead and trail code points combine into the combined one,
131 * i.e., there is a canonical decomposition of combined-> <lead, trail>.
133 * Before processCombining() is called, leadIndex and trailIndex are 0.
134 * After processCombining(), they contain the indexes of the lead and trail
135 * code point in the combiningCPs[] array.
136 * They are then sorted by leadIndex, then trailIndex.
137 * They are not sorted by code points.
139 typedef struct CombiningTriple
{
140 uint16_t leadIndex
, trailIndex
;
141 uint32_t lead
, trail
, combined
;
144 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
145 static uint16_t combiningTable
[0x8000];
146 static uint16_t combiningTableTop
=0;
148 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
149 static uint16_t canonStartSets
[_NORM_MAX_CANON_SETS
+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
150 +10000]; /* +10000 for exclusion sets */
151 static int32_t canonStartSetsTop
=_NORM_SET_INDEX_TOP
;
152 static int32_t canonSetsCount
=0;
158 normTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
159 uprv_memset(normTrie
, 0, sizeof(UNewTrie
));
160 norm32Trie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
161 uprv_memset(norm32Trie
, 0, sizeof(UNewTrie
));
162 fcdTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
163 uprv_memset(fcdTrie
, 0, sizeof(UNewTrie
));
164 auxTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
165 uprv_memset(auxTrie
, 0, sizeof(UNewTrie
));
167 /* initialize the two tries */
168 if(NULL
==utrie_open(normTrie
, NULL
, 30000, 0, 0, FALSE
)) {
169 fprintf(stderr
, "error: failed to initialize tries\n");
170 exit(U_MEMORY_ALLOCATION_ERROR
);
173 /* allocate Norm structures and reset the first one */
174 normMem
=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm
));
175 norms
=utm_alloc(normMem
);
177 /* allocate UTF-32 string memory */
178 utf32Mem
=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
180 /* reset all "have seen" flags */
181 uprv_memset(haveSeenFlags
, 0, sizeof(haveSeenFlags
));
183 /* open an empty set */
184 nfdQCNoSet
=uset_open(1, 0);
186 /* allocate extra data memory for UTF-16 decomposition strings and other values */
187 extraMem
=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP
, _NORM_EXTRA_INDEX_TOP
, 2);
188 /* initialize the extraMem counter for the top of FNC strings */
189 p16
=(uint16_t *)utm_alloc(extraMem
);
192 /* allocate temporary memory for combining triples */
193 combiningTriplesMem
=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple
));
195 /* set the minimum code points for no/maybe quick check values to the end of the BMP */
196 indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]=0xffff;
197 indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]=0xffff;
198 indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]=0xffff;
199 indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]=0xffff;
201 /* preset the indexes portion of canonStartSets */
202 uprv_memset(canonStartSets
, 0, _NORM_SET_INDEX_TOP
*2);
206 * get or create a Norm unit;
207 * get or create the intermediate trie entries for it as well
210 createNorm(uint32_t code
) {
214 i
=utrie_get32(normTrie
, (UChar32
)code
, NULL
);
219 p
=(Norm
*)utm_alloc(normMem
);
220 if(!utrie_set32(normTrie
, (UChar32
)code
, (uint32_t)(p
-norms
))) {
221 fprintf(stderr
, "error: too many normalization entries\n");
222 exit(U_BUFFER_OVERFLOW_ERROR
);
228 /* get an existing Norm unit */
230 getNorm(uint32_t code
) {
233 i
=utrie_get32(normTrie
, (UChar32
)code
, NULL
);
240 /* get the canonical combining class of a character */
242 getCCFromCP(uint32_t code
) {
243 Norm
*norm
=getNorm(code
);
247 return norm
->udataCC
;
252 * enumerate all code points with their Norm structs and call a function for each
253 * return the number of code points with data
256 enumTrie(EnumTrieFn
*fn
, void *context
) {
262 for(code
=0; code
<=0x10ffff;) {
263 i
=utrie_get32(normTrie
, code
, &isInBlockZero
);
265 code
+=UTRIE_DATA_BLOCK_LENGTH
;
268 fn(context
, (uint32_t)code
, norms
+i
);
278 setHaveSeenString(const uint32_t *s
, int32_t length
) {
283 haveSeenFlags
[(c
>>5)&0xff]|=(1<<(c
&0x1f));
288 #define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
290 /* handle combining data ---------------------------------------------------- */
293 * Insert an entry into combiningCPs[] for the new code point code with its flags.
294 * The flags indicate if code combines forward, backward, or both.
296 * combiningCPs[] contains three sections:
297 * 1. code points that combine forward
298 * 2. code points that combine forward and backward
299 * 3. code points that combine backward
301 * Search for code in the entire array.
302 * If it is found and already is in the right section (old flags==new flags)
304 * If it is found but the flags are different, then remove it,
305 * union the old and new flags, and reinsert it into its correct section.
306 * If it is not found, then just insert it.
308 * Within each section, the code points are not sorted.
311 addCombiningCP(uint32_t code
, uint8_t flags
) {
315 newEntry
=code
|((uint32_t)flags
<<24);
317 /* search for this code point */
318 for(i
=0; i
<combineBackTop
; ++i
) {
319 if(code
==(combiningCPs
[i
]&0xffffff)) {
321 if(newEntry
==combiningCPs
[i
]) {
322 return; /* no change */
325 /* combine the flags, remove the old entry from the old place, and insert the new one */
326 newEntry
|=combiningCPs
[i
];
327 if(i
!=--combineBackTop
) {
328 uprv_memmove(combiningCPs
+i
, combiningCPs
+i
+1, (combineBackTop
-i
)*4);
330 if(i
<combineBothTop
) {
333 if(i
<combineFwdTop
) {
340 /* not found or modified, insert it */
341 if(combineBackTop
>=sizeof(combiningCPs
)/4) {
342 fprintf(stderr
, "error: gennorm combining code points - trying to use more than %ld units\n",
343 (long)(sizeof(combiningCPs
)/4));
344 exit(U_MEMORY_ALLOCATION_ERROR
);
347 /* set i to the insertion point */
348 flags
=(uint8_t)(newEntry
>>24);
352 } else if(flags
==3) {
354 } else /* flags==2 */ {
358 /* move the following code points up one and insert newEntry at i */
359 if(i
<combineBackTop
) {
360 uprv_memmove(combiningCPs
+i
+1, combiningCPs
+i
, (combineBackTop
-i
)*4);
362 combiningCPs
[i
]=newEntry
;
364 /* finally increment the total counter */
369 * Find the index in combiningCPs[] where code point code is stored.
370 * @param code code point to look for
371 * @param isLead is code a forward combining code point?
372 * @return index in combiningCPs[] where code is stored
375 findCombiningCP(uint32_t code
, UBool isLead
) {
380 limit
=combineBothTop
;
383 limit
=combineBackTop
;
386 /* search for this code point */
387 for(; i
<limit
; ++i
) {
388 if(code
==(combiningCPs
[i
]&0xffffff)) {
399 addCombiningTriple(uint32_t lead
, uint32_t trail
, uint32_t combined
) {
400 CombiningTriple
*triple
;
403 * set combiningFlags for the two code points
404 * do this after decomposition so that getNorm() above returns NULL
405 * if we do not have actual sub-decomposition data for the initial NFD here
407 createNorm(lead
)->combiningFlags
|=1; /* combines forward */
408 createNorm(trail
)->combiningFlags
|=2; /* combines backward */
410 addCombiningCP(lead
, 1);
411 addCombiningCP(trail
, 2);
413 triple
=(CombiningTriple
*)utm_alloc(combiningTriplesMem
);
416 triple
->combined
=combined
;
420 compareTriples(const void *l
, const void *r
) {
422 diff
=(int)((CombiningTriple
*)l
)->leadIndex
-
423 (int)((CombiningTriple
*)r
)->leadIndex
;
425 diff
=(int)((CombiningTriple
*)l
)->trailIndex
-
426 (int)((CombiningTriple
*)r
)->trailIndex
;
433 CombiningTriple
*triples
;
436 uint16_t i
, j
, count
, tableTop
, finalIndex
, combinesFwd
;
438 triples
=utm_getStart(combiningTriplesMem
);
440 /* add lead and trail indexes to the triples for sorting */
441 count
=(uint16_t)utm_countItems(combiningTriplesMem
);
442 for(i
=0; i
<count
; ++i
) {
443 /* findCombiningCP() must always find the code point */
444 triples
[i
].leadIndex
=findCombiningCP(triples
[i
].lead
, TRUE
);
445 triples
[i
].trailIndex
=findCombiningCP(triples
[i
].trail
, FALSE
);
448 /* sort them by leadIndex, trailIndex */
449 qsort(triples
, count
, sizeof(CombiningTriple
), compareTriples
);
451 /* calculate final combining indexes and store them in the Norm entries */
453 j
=0; /* triples counter */
455 /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
456 for(i
=0; i
<combineBothTop
; ++i
) {
457 /* start a new table */
459 /* assign combining index */
460 createNorm(combiningCPs
[i
]&0xffffff)->combiningIndex
=combiningIndexes
[i
]=tableTop
;
462 /* calculate the length of the combining data for this lead code point in the combiningTable */
463 while(j
<count
&& i
==triples
[j
].leadIndex
) {
464 /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
465 combined
=triples
[j
++].combined
;
466 if(combined
<=0x1fff) {
474 /* second, combining indexes of back-only characters are simply incremented from here to be unique */
476 for(; i
<combineBackTop
; ++i
) {
477 createNorm(combiningCPs
[i
]&0xffffff)->combiningIndex
=combiningIndexes
[i
]=finalIndex
++;
480 /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
481 if(finalIndex
>0x8000) {
482 fprintf(stderr
, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
483 tableTop
, (long)(sizeof(combiningTable
)/4));
484 exit(U_MEMORY_ALLOCATION_ERROR
);
487 combiningTableTop
=tableTop
;
489 /* store the combining data in the combiningTable, with the final indexes from above */
491 j
=0; /* triples counter */
494 * this is essentially the same loop as above, but
495 * it writes the table data instead of calculating and setting the final indexes;
496 * it is necessary to have two passes so that all the final indexes are known before
497 * they are written into the table
499 for(i
=0; i
<combineBothTop
; ++i
) {
500 /* start a new table */
502 combined
=0; /* avoid compiler warning */
504 /* store the combining data for this lead code point in the combiningTable */
505 while(j
<count
&& i
==triples
[j
].leadIndex
) {
506 finalIndex
=combiningIndexes
[triples
[j
].trailIndex
];
507 combined
=triples
[j
++].combined
;
509 /* is combined a starter? (i.e., cc==0 && combines forward) */
510 combinesFwd
=(uint16_t)((getNorm(combined
)->combiningFlags
&1)<<13);
513 if(combined
<=0x1fff) {
514 *p
++=(uint16_t)(combinesFwd
|combined
);
515 } else if(combined
<=0xffff) {
516 *p
++=(uint16_t)(0x8000|combinesFwd
);
517 *p
++=(uint16_t)combined
;
519 *p
++=(uint16_t)(0xc000|combinesFwd
|((combined
-0x10000)>>10));
520 *p
++=(uint16_t)(0xdc00|(combined
&0x3ff));
524 /* set a marker on the last final trail index in this lead's table */
525 if(combined
<=0x1fff) {
532 /* post condition: tableTop==(p-combiningTable) */
535 /* processing incoming normalization data ----------------------------------- */
538 * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
539 * c must be a Hangul syllable code point.
542 getHangulDecomposition(uint32_t c
, Norm
*pHangulNorm
, uint32_t hangulBuffer
[3]) {
543 /* Hangul syllable: decompose algorithmically */
547 uprv_memset(pHangulNorm
, 0, sizeof(Norm
));
554 hangulBuffer
[2]=JAMO_T_BASE
+c2
;
561 hangulBuffer
[1]=JAMO_V_BASE
+c%JAMO_V_COUNT
;
562 hangulBuffer
[0]=JAMO_L_BASE
+c
/JAMO_V_COUNT
;
564 pHangulNorm
->nfd
=pHangulNorm
->nfkd
=hangulBuffer
;
565 pHangulNorm
->lenNFD
=pHangulNorm
->lenNFKD
=length
;
569 * decompose the one decomposition further, may generate two decompositions
570 * apply all previous characters' decompositions to this one
573 decompStoreNewNF(uint32_t code
, Norm
*norm
) {
574 uint32_t nfd
[40], nfkd
[40], hangulBuffer
[3];
581 uint8_t lenNFD
=0, lenNFKD
=0;
582 UBool changedNFD
=FALSE
, changedNFKD
=FALSE
;
584 if((length
=norm
->lenNFD
)!=0) {
585 /* always allocate the original string */
588 } else if((length
=norm
->lenNFKD
)!=0) {
589 /* always allocate the original string */
593 /* no decomposition here, nothing to do */
597 /* decompose each code point */
598 for(i
=0; i
<length
; ++i
) {
602 if(HANGUL_BASE
<=c
&& c
<(HANGUL_BASE
+HANGUL_COUNT
)) {
603 getHangulDecomposition(c
, &hangulNorm
, hangulBuffer
);
606 /* no data, no decomposition */
613 /* canonically decompose c */
616 uprv_memcpy(nfd
+lenNFD
, p
->nfd
, p
->lenNFD
*4);
623 /* compatibility-decompose c */
625 uprv_memcpy(nfkd
+lenNFKD
, p
->nfkd
, p
->lenNFKD
*4);
628 } else if(p
->lenNFD
!=0) {
629 uprv_memcpy(nfkd
+lenNFKD
, p
->nfd
, p
->lenNFD
*4);
637 /* assume that norm->lenNFD==1 or ==2 */
638 if(norm
->lenNFD
==2 && !(norm
->combiningFlags
&0x80)) {
639 addCombiningTriple(s32
[0], s32
[1], code
);
644 s32
=utm_allocN(utf32Mem
, lenNFD
);
645 uprv_memcpy(s32
, nfd
, lenNFD
*4);
651 setHaveSeenString(nfd
, lenNFD
);
655 s32
=utm_allocN(utf32Mem
, lenNFKD
);
656 uprv_memcpy(s32
, nfkd
, lenNFKD
*4);
660 norm
->lenNFKD
=lenNFKD
;
662 setHaveSeenString(nfkd
, lenNFKD
);
666 typedef struct DecompSingle
{
672 * apply this one character's decompositions (there is at least one!) to
673 * all previous characters' decompositions to decompose them further
676 decompWithSingleFn(void *context
, uint32_t code
, Norm
*norm
) {
677 uint32_t nfd
[40], nfkd
[40];
679 DecompSingle
*me
=(DecompSingle
*)context
;
682 uint8_t lenNFD
=0, lenNFKD
=0, myLenNFD
, myLenNFKD
;
683 UBool changedNFD
=FALSE
, changedNFKD
=FALSE
;
685 /* get the new character's data */
687 myLenNFD
=me
->norm
->lenNFD
;
688 myLenNFKD
=me
->norm
->lenNFKD
;
689 /* assume that myC has at least one decomposition */
691 if((length
=norm
->lenNFD
)!=0 && myLenNFD
!=0) {
692 /* apply NFD(myC) to norm->nfd */
694 for(i
=0; i
<length
; ++i
) {
697 uprv_memcpy(nfd
+lenNFD
, me
->norm
->nfd
, myLenNFD
*4);
706 if((length
=norm
->lenNFKD
)!=0) {
707 /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
709 for(i
=0; i
<length
; ++i
) {
713 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfkd
, myLenNFKD
*4);
715 } else /* assume myLenNFD!=0 */ {
716 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfd
, myLenNFD
*4);
724 } else if((length
=norm
->lenNFD
)!=0 && myLenNFKD
!=0) {
725 /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
727 for(i
=0; i
<length
; ++i
) {
730 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfkd
, myLenNFKD
*4);
739 /* set the new decompositions, forget the old ones */
742 if(lenNFD
>norm
->lenNFD
) {
743 s32
=utm_allocN(utf32Mem
, lenNFD
);
747 uprv_memcpy(s32
, nfd
, lenNFD
*4);
756 if(lenNFKD
>norm
->lenNFKD
) {
757 s32
=utm_allocN(utf32Mem
, lenNFKD
);
761 uprv_memcpy(s32
, nfkd
, lenNFKD
*4);
765 norm
->lenNFKD
=lenNFKD
;
771 * process the data for one code point listed in UnicodeData;
772 * UnicodeData itself never maps a code point to both NFD and NFKD
775 storeNorm(uint32_t code
, Norm
*norm
) {
776 DecompSingle decompSingle
;
779 /* copy existing derived normalization properties */
781 norm
->qcFlags
=p
->qcFlags
;
782 norm
->combiningFlags
=p
->combiningFlags
;
783 norm
->fncIndex
=p
->fncIndex
;
785 /* process the decomposition if there is at least one here */
786 if((norm
->lenNFD
|norm
->lenNFKD
)!=0) {
787 /* decompose this one decomposition further, may generate two decompositions */
788 decompStoreNewNF(code
, norm
);
790 /* has this code point been used in previous decompositions? */
791 if(HAVE_SEEN(code
)) {
792 /* use this decomposition to decompose other decompositions further */
794 decompSingle
.norm
=norm
;
795 enumTrie(decompWithSingleFn
, &decompSingle
);
800 uprv_memcpy(p
, norm
, sizeof(Norm
));
804 setQCFlags(uint32_t code
, uint8_t qcFlags
) {
805 createNorm(code
)->qcFlags
|=qcFlags
;
807 /* adjust the minimum code point for quick check no/maybe */
809 if((qcFlags
&_NORM_QC_NFC
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]) {
810 indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]=(uint16_t)code
;
812 if((qcFlags
&_NORM_QC_NFKC
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]) {
813 indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]=(uint16_t)code
;
815 if((qcFlags
&_NORM_QC_NFD
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]) {
816 indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]=(uint16_t)code
;
818 if((qcFlags
&_NORM_QC_NFKD
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]) {
819 indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]=(uint16_t)code
;
823 if(qcFlags
&_NORM_QC_NFD
) {
824 uset_add(nfdQCNoSet
, (UChar32
)code
);
829 setCompositionExclusion(uint32_t code
) {
830 createNorm(code
)->combiningFlags
|=0x80;
834 setHangulJamoSpecials() {
839 * Hangul syllables are algorithmically decomposed into Jamos,
840 * and Jamos are algorithmically composed into Hangul syllables.
841 * The quick check flags are parsed, except for Hangul.
844 /* set Jamo L specials */
846 for(c
=0x1100; c
<=0x1112; ++c
) {
848 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_L
;
849 norm
->combiningFlags
=1;
851 /* for each Jamo L create a set with its associated Hangul block */
852 norm
->canonStart
=uset_open(hangul
, hangul
+21*28-1);
856 /* set Jamo V specials */
857 for(c
=0x1161; c
<=0x1175; ++c
) {
859 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_V
;
860 norm
->combiningFlags
=2;
861 norm
->unsafeStart
=TRUE
;
864 /* set Jamo T specials */
865 for(c
=0x11a8; c
<=0x11c2; ++c
) {
867 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_T
;
868 norm
->combiningFlags
=2;
869 norm
->unsafeStart
=TRUE
;
872 /* set Hangul specials, precompacted */
873 norm
=(Norm
*)utm_alloc(normMem
);
874 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_HANGUL
;
875 norm
->qcFlags
=_NORM_QC_NFD
|_NORM_QC_NFKD
;
877 if(!utrie_setRange32(normTrie
, 0xac00, 0xd7a4, (uint32_t)(norm
-norms
), TRUE
)) {
878 fprintf(stderr
, "error: too many normalization entries (setting Hangul)\n");
879 exit(U_BUFFER_OVERFLOW_ERROR
);
884 * set FC-NFKC-Closure string
885 * s contains the closure string; s[0]==length, s[1..length] is the actual string
889 setFNC(uint32_t c
, UChar
*s
) {
891 int32_t length
, i
, count
;
894 count
=utm_countItems(extraMem
);
898 /* try to overlay single-unit strings with existing ones */
899 if(length
==1 && first
<0xff00) {
900 p
=utm_getStart(extraMem
);
901 for(i
=1; i
<count
; ++i
) {
910 /* append the new string if it cannot be overlaid with an old one */
912 if(count
>_NORM_AUX_MAX_FNC
) {
913 fprintf(stderr
, "gennorm error: too many FNC strings\n");
914 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
917 /* prepend 0xffxx with xx==length */
918 s
[0]=(uint16_t)(0xff00+length
);
920 p
=(uint16_t *)utm_allocN(extraMem
, length
);
921 uprv_memcpy(p
, s
, length
*2);
923 /* update the top index in extraMem[0] */
925 ((uint16_t *)utm_getStart(extraMem
))[0]=(uint16_t)count
;
928 /* store the index to the string */
929 createNorm(c
)->fncIndex
=i
;
932 /* build runtime structures ------------------------------------------------- */
934 /* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
936 reorderString(uint32_t *s
, int32_t length
) {
946 for(i
=0; i
<length
; ++i
) {
947 /* get the i-th code point and its combining class */
951 /* it is a combining mark, see if it needs to be moved back */
956 break; /* found the right place */
958 /* move the previous code point here and go back */
965 /* just store the combining class */
970 return (uint16_t)(((uint16_t)ccs
[0]<<8)|ccs
[length
-1]);
973 static UBool combineAndQC
[64]={ 0 };
976 * canonically reorder the up to two decompositions
977 * and store the leading and trailing combining classes accordingly
979 * also process canonical decompositions for canonical closure
982 postParseFn(void *context
, uint32_t code
, Norm
*norm
) {
985 /* canonically order the NFD */
988 norm
->canonBothCCs
=reorderString(norm
->nfd
, length
);
991 /* canonically reorder the NFKD */
992 length
=norm
->lenNFKD
;
994 norm
->compatBothCCs
=reorderString(norm
->nfkd
, length
);
997 /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
998 if((norm
->lenNFD
!=0) != ((norm
->qcFlags
&_NORM_QC_NFD
)!=0)) {
999 fprintf(stderr
, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code
, norm
->lenNFD
, norm
->qcFlags
);
1001 if(((norm
->lenNFD
|norm
->lenNFKD
)!=0) != ((norm
->qcFlags
&(_NORM_QC_NFD
|_NORM_QC_NFKD
))!=0)) {
1002 fprintf(stderr
, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code
, norm
->lenNFD
, norm
->lenNFKD
, norm
->qcFlags
);
1005 /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
1006 combineAndQC
[(norm
->qcFlags
&0x33)|((norm
->combiningFlags
&3)<<2)]=1;
1008 if(norm
->combiningFlags
&1) {
1009 if(norm
->udataCC
!=0) {
1010 /* illegal - data-derivable composition exclusion */
1011 fprintf(stderr
, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code
, norm
->udataCC
);
1014 if(norm
->combiningFlags
&2) {
1015 if((norm
->qcFlags
&0x11)==0) {
1016 fprintf(stderr
, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code
);
1019 /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
1020 if(norm
->udataCC
==0) {
1021 printf("U+%04lx combines backward but udataCC==0\n", (long)code
);
1025 if((norm
->combiningFlags
&3)==3 && beVerbose
) {
1026 printf("U+%04lx combines both ways\n", (long)code
);
1030 * process canonical decompositions for canonical closure
1032 * in each canonical decomposition:
1033 * add the current character (code) to the set of canonical starters of its norm->nfd[0]
1034 * set the "unsafe starter" flag for each norm->nfd[1..]
1036 length
=norm
->lenNFD
;
1042 /* nfd[0].canonStart.add(code) */
1044 otherNorm
=createNorm(c
);
1045 if(otherNorm
->canonStart
==NULL
) {
1046 otherNorm
->canonStart
=uset_open(code
, code
);
1047 if(otherNorm
->canonStart
==NULL
) {
1048 fprintf(stderr
, "gennorm error: out of memory in uset_open()\n");
1049 exit(U_MEMORY_ALLOCATION_ERROR
);
1052 uset_add(otherNorm
->canonStart
, code
);
1053 if(!uset_contains(otherNorm
->canonStart
, code
)) {
1054 fprintf(stderr
, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c
, (int)code
);
1055 exit(U_INTERNAL_PROGRAM_ERROR
);
1059 /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
1060 for(i
=1; i
<length
; ++i
) {
1061 createNorm(norm
->nfd
[i
])->unsafeStart
=TRUE
;
1067 make32BitNorm(Norm
*norm
) {
1071 int32_t i
, length
, beforeZero
=0, count
, start
;
1074 * Check for assumptions:
1076 * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
1077 * then the decomposition also begins with a true starter.
1079 if(norm
->udataCC
==0) {
1080 /* this is a starter */
1081 if((norm
->qcFlags
&_NORM_QC_NFC
)==0 && norm
->lenNFD
>0) {
1082 /* a "true" NFC starter with a canonical decomposition */
1083 if( norm
->canonBothCCs
>=0x100 || /* lead cc!=0 or */
1084 ((other
=getNorm(norm
->nfd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFC
)!=0) /* nfd[0] not NFC_YES */
1087 "error: true NFC starter canonical decomposition[%u] does not begin\n"
1088 " with a true NFC starter: U+%04lx U+%04lx%s\n",
1089 norm
->lenNFD
, (long)norm
->nfd
[0], (long)norm
->nfd
[1],
1090 norm
->lenNFD
<=2 ? "" : " ...");
1091 exit(U_INVALID_TABLE_FILE
);
1095 if((norm
->qcFlags
&_NORM_QC_NFKC
)==0) {
1096 if(norm
->lenNFKD
>0) {
1097 /* a "true" NFKC starter with a compatibility decomposition */
1098 if( norm
->compatBothCCs
>=0x100 || /* lead cc!=0 or */
1099 ((other
=getNorm(norm
->nfkd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFKC
)!=0) /* nfkd[0] not NFC_YES */
1102 "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
1103 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1104 norm
->lenNFKD
, (long)norm
->nfkd
[0], (long)norm
->nfkd
[1], norm
->lenNFKD
<=2 ? "" : " ...");
1105 exit(U_INVALID_TABLE_FILE
);
1107 } else if(norm
->lenNFD
>0) {
1108 /* a "true" NFKC starter with only a canonical decomposition */
1109 if( norm
->canonBothCCs
>=0x100 || /* lead cc!=0 or */
1110 ((other
=getNorm(norm
->nfd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFKC
)!=0) /* nfd[0] not NFC_YES */
1113 "error: true NFKC starter canonical decomposition[%u] does not begin\n"
1114 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1115 norm
->lenNFD
, (long)norm
->nfd
[0], (long)norm
->nfd
[1],
1116 norm
->lenNFD
<=2 ? "" : " ...");
1117 exit(U_INVALID_TABLE_FILE
);
1123 /* reset the 32-bit word and set the quick check flags */
1126 /* set the UnicodeData combining class */
1127 word
|=(uint32_t)norm
->udataCC
<<_NORM_CC_SHIFT
;
1129 /* set the combining flag and index */
1130 if(norm
->combiningFlags
&3) {
1131 word
|=(uint32_t)(norm
->combiningFlags
&3)<<6;
1134 /* set the combining index value into the extra data */
1135 if(norm
->combiningIndex
!=0) {
1136 extra
[0]=norm
->combiningIndex
;
1142 /* write the decompositions */
1143 if((norm
->lenNFD
|norm
->lenNFKD
)!=0) {
1144 extra
[count
++]=0; /* set the pieces when available, into extra[beforeZero] */
1146 length
=norm
->lenNFD
;
1148 if(norm
->canonBothCCs
!=0) {
1149 extra
[beforeZero
]|=0x80;
1150 extra
[count
++]=norm
->canonBothCCs
;
1153 for(i
=0; i
<length
; ++i
) {
1154 UTF_APPEND_CHAR_UNSAFE(extra
, count
, norm
->nfd
[i
]);
1156 extra
[beforeZero
]|=(UChar
)(count
-start
); /* set the decomp length as the number of UTF-16 code units */
1159 length
=norm
->lenNFKD
;
1161 if(norm
->compatBothCCs
!=0) {
1162 extra
[beforeZero
]|=0x8000;
1163 extra
[count
++]=norm
->compatBothCCs
;
1166 for(i
=0; i
<length
; ++i
) {
1167 UTF_APPEND_CHAR_UNSAFE(extra
, count
, norm
->nfkd
[i
]);
1169 extra
[beforeZero
]|=(UChar
)((count
-start
)<<8); /* set the decomp length as the number of UTF-16 code units */
1173 /* allocate and copy the extra data */
1177 if(norm
->specialTag
!=0) {
1178 fprintf(stderr
, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm
->specialTag
);
1179 exit(U_ILLEGAL_ARGUMENT_ERROR
);
1182 p
=(UChar
*)utm_allocN(extraMem
, count
);
1183 uprv_memcpy(p
, extra
, count
*2);
1185 /* set the extra index, offset by beforeZero */
1186 word
|=(uint32_t)(beforeZero
+(p
-(UChar
*)utm_getStart(extraMem
)))<<_NORM_EXTRA_SHIFT
;
1187 } else if(norm
->specialTag
!=0) {
1188 /* set a special tag instead of an extra index */
1189 word
|=(uint32_t)norm
->specialTag
<<_NORM_EXTRA_SHIFT
;
1195 /* turn all Norm structs into corresponding 32-bit norm values */
1198 uint32_t *pNormData
;
1200 int32_t i
, normLength
, count
;
1202 count
=(int32_t)utm_countItems(normMem
);
1203 for(i
=0; i
<count
; ++i
) {
1204 norms
[i
].value32
=make32BitNorm(norms
+i
);
1207 pNormData
=utrie_getData(norm32Trie
, &normLength
);
1210 for(i
=0; i
<normLength
; ++i
) {
1212 if(0!=(pNormData
[i
]=norms
[n
].value32
)) {
1219 * extract all Norm.canonBothCCs into the FCD table
1220 * set 32-bit values to use the common fold and compact functions
1226 int32_t i
, count
, fcdLength
;
1229 count
=utm_countItems(normMem
);
1230 for(i
=0; i
<count
; ++i
) {
1231 bothCCs
=norms
[i
].canonBothCCs
;
1233 /* if there are no decomposition cc's then use the udataCC twice */
1234 bothCCs
=norms
[i
].udataCC
;
1235 bothCCs
|=bothCCs
<<8;
1237 norms
[i
].value32
=bothCCs
;
1240 pFCDData
=utrie_getData(fcdTrie
, &fcdLength
);
1242 for(i
=0; i
<fcdLength
; ++i
) {
1244 pFCDData
[i
]=norms
[n
].value32
;
1249 * If the given set contains exactly one character, then return it.
1250 * Otherwise return -1.
1253 usetContainsOne(const USet
* set
) {
1254 if (uset_size(set
) == 1) { /* ### faster to count ranges and check only range?! */
1256 UErrorCode ec
= U_ZERO_ERROR
;
1257 int32_t len
= uset_getItem(set
, 0, &start
, &end
, NULL
, 0, &ec
);
1258 if (len
== 0) return start
;
1264 makeCanonSetFn(void *context
, uint32_t code
, Norm
*norm
) {
1265 if(norm
->canonStart
!=NULL
&& !uset_isEmpty(norm
->canonStart
)) {
1267 int32_t c
, tableLength
;
1268 UErrorCode errorCode
=U_ZERO_ERROR
;
1270 /* does the set contain exactly one code point? */
1271 c
=usetContainsOne(norm
->canonStart
); /* ### why? */
1273 /* add an entry to the BMP or supplementary search table */
1275 table
=canonStartSets
+_NORM_MAX_CANON_SETS
;
1276 tableLength
=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1278 table
[tableLength
++]=(uint16_t)code
;
1280 if(c
>=0 && c
<=0xffff && (c
&_NORM_CANON_SET_BMP_MASK
)!=_NORM_CANON_SET_BMP_IS_INDEX
) {
1281 /* single-code point BMP result for BMP code point */
1282 table
[tableLength
++]=(uint16_t)c
;
1284 table
[tableLength
++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX
|canonStartSetsTop
);
1287 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]=(uint16_t)tableLength
;
1289 table
=canonStartSets
+_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
;
1290 tableLength
=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
1292 table
[tableLength
++]=(uint16_t)(code
>>16);
1293 table
[tableLength
++]=(uint16_t)code
;
1296 /* single-code point result for supplementary code point */
1297 table
[tableLength
-2]|=(uint16_t)(0x8000|((c
>>8)&0x1f00)); /* ### how does this work again? */
1298 table
[tableLength
++]=(uint16_t)c
;
1300 table
[tableLength
++]=(uint16_t)canonStartSetsTop
;
1302 canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]=(uint16_t)tableLength
;
1306 /* write a USerializedSet */
1309 uset_serialize(norm
->canonStart
,
1310 canonStartSets
+canonStartSetsTop
,
1311 _NORM_MAX_CANON_SETS
-canonStartSetsTop
,
1314 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]=(uint16_t)canonStartSetsTop
;
1316 if(U_FAILURE(errorCode
)) {
1317 fprintf(stderr
, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode
), (int)canonStartSetsTop
);
1320 if(tableLength
>_NORM_MAX_SET_SEARCH_TABLE_LENGTH
) {
1321 fprintf(stderr
, "gennorm error: search table for canonical starter sets too long\n");
1322 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
1327 /* for getSkippableFlags ---------------------------------------------------- */
1329 /* combine the lead and trail code points; return <0 if they do not combine */
1331 combine(uint32_t lead
, uint32_t trail
) {
1332 CombiningTriple
*triples
;
1335 /* search for all triples with c as lead code point */
1336 triples
=utm_getStart(combiningTriplesMem
);
1337 count
=utm_countItems(combiningTriplesMem
);
1339 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1340 for(i
=0; i
<count
&& lead
!=triples
[i
].lead
; ++i
) {}
1342 /* check each triple for this code point */
1343 for(; i
<count
&& lead
==triples
[i
].lead
; ++i
) {
1344 if(trail
==triples
[i
].trail
) {
1345 return (int32_t)triples
[i
].combined
;
1353 * Starting from the canonical decomposition s[0..length[ of a single code point,
1354 * is the code point c consumed in an NFC/FCC recomposition?
1356 * No need to handle discontiguous composition because that would not consume some
1357 * intermediate character, so would not compose back to the original character.
1358 * See comments in canChangeWithFollowing().
1360 * No need to compose beyond where c canonically orders because if it is consumed
1361 * then the result differs from the original anyway.
1363 * Possible optimization:
1364 * - Verify that there are no cases of the same combining mark stacking twice.
1365 * - return FALSE right away if c inserts after a copy of itself
1366 * without attempting to recompose; will happen because each mark in
1367 * the decomposition will be enumerated and passed in as c.
1368 * More complicated and fragile though than it is already.
1373 doesComposeConsume(const uint32_t *s
, int32_t length
, uint32_t c
, uint8_t cc
) {
1376 /* ignore trailing characters where cc<prevCC */
1377 while(length
>1 && cc
<getCCFromCP(s
[length
-1])) {
1381 /* start consuming/combining from the beginning */
1382 starter
=(int32_t)s
[0];
1383 for(i
=1; i
<length
; ++i
) {
1384 starter
=combine((uint32_t)starter
, s
[i
]);
1386 fprintf(stderr
, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
1387 (int)s
[0], (int)s
[1], (int)length
, (int)c
, cc
);
1388 exit(U_INTERNAL_PROGRAM_ERROR
);
1392 /* try to combine/consume c, return TRUE if it is consumed */
1393 return combine((uint32_t)starter
, c
)>=0;
1396 /* does the starter s[0] combine forward with another char that is below trailCC? */
1398 canChangeWithFollowing(const uint32_t *s
, int32_t length
, uint8_t trailCC
) {
1400 /* no character will combine ahead of the trailing char of the decomposition */
1405 * We are only checking skippable condition (f).
1406 * Therefore, the original character does not have quick check flag NFC_NO (c),
1407 * i.e., the decomposition recomposes completely back into the original code point.
1408 * So s[0] must be a true starter with cc==0 and
1409 * combining with following code points.
1411 * Similarly, length==1 is not possible because that would be a singleton
1412 * decomposition which is marked with NFC_NO and does not pass (c).
1414 * Only a character with cc<trailCC can change the composition.
1415 * Reason: A char with cc>=trailCC would order after decomposition s[],
1416 * composition would consume all of the decomposition, and here we know that
1417 * the original char passed check d), i.e., it does not combine forward,
1418 * therefore does not combine with anything after the decomposition is consumed.
1420 * Now see if there is a character that
1421 * 1. combines backward
1423 * 3. is consumed in recomposition
1425 * length==2 is simple:
1427 * Characters that fulfill these conditions are exactly the ones that combine directly
1428 * with the starter c==s[0] because there is no intervening character after
1430 * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
1431 * and see if one has cc<trailCC (passes 2.).
1433 * length>2 is a little harder:
1435 * Since we will get different starters during recomposition, we need to
1436 * enumerate each backward-combining character (1.)
1437 * with cc<trailCC (2.) and
1438 * see if it gets consumed in recomposition. (3.)
1439 * No need to enumerate both-ways combining characters because they must have cc==0.
1442 /* enumerate all chars that combine with this one and check their cc */
1443 CombiningTriple
*triples
;
1444 uint32_t c
, i
, count
;
1447 /* search for all triples with c as lead code point */
1448 triples
=utm_getStart(combiningTriplesMem
);
1449 count
=utm_countItems(combiningTriplesMem
);
1452 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1453 for(i
=0; i
<count
&& c
!=triples
[i
].lead
; ++i
) {}
1455 /* check each triple for this code point */
1456 for(; i
<count
&& c
==triples
[i
].lead
; ++i
) {
1457 cc
=getCCFromCP(triples
[i
].trail
);
1458 if(cc
>0 && cc
<trailCC
) {
1459 /* this trail code point combines with c and has cc<trailCC */
1464 /* enumerate all chars that combine backward */
1469 for(i
=combineBothTop
; i
<combineBackTop
; ++i
) {
1470 c2
=combiningCPs
[i
]&0xffffff;
1472 /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
1473 if(cc
>0 && cc
<trailCC
&& doesComposeConsume(s
, length
-1, c2
, cc
)) {
1479 /* this decomposition is not modified by any appended character */
1483 /* see unormimp.h for details on NF*C Skippable flags */
1485 getSkippableFlags(const Norm
*norm
) {
1486 /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
1488 /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
1489 if(norm
->specialTag
==_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_HANGUL
) {
1493 /* ### check other data generation functions whether they should & do ignore Hangul/Jamo specials */
1497 * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
1499 * This means that (a)..(e) must always be derived from the runtime norm32 value,
1500 * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
1501 * the form is NF*C and there is a canonical decomposition (NFD_NO).
1503 * (a) unassigned code points get "not skippable"==false because they
1504 * don't have a Norm struct so they won't get here
1507 /* (b) not skippable if cc!=0 */
1508 if(norm
->udataCC
!=0) {
1509 return 0; /* non-zero flag for (f) only */
1513 * not NFC_Skippable if
1514 * (c) quick check flag == NO or
1515 * (d) combines forward or
1516 * (e) combines back or
1517 * (f) can change if another character is added
1520 * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
1521 * check its composition list,
1522 * see if any of the second code points in the list
1523 * has cc less than the trailCC of the decomposition.
1525 * For FCC: Test at runtime if the decomposition has a trailCC>1
1526 * -> there are characters with cc==1, they would order before the trail char
1527 * and prevent contiguous combination with the trail char.
1529 if( (norm
->qcFlags
&(_NORM_QC_NFC
&_NORM_QC_ANY_NO
))!=0 ||
1530 (norm
->combiningFlags
&3)!=0) {
1531 return 0; /* non-zero flag for (f) only */
1533 if(norm
->lenNFD
!=0 && canChangeWithFollowing(norm
->nfd
, norm
->lenNFD
, (uint8_t)norm
->canonBothCCs
)) {
1534 return _NORM_AUX_NFC_SKIP_F_MASK
;
1537 return 0; /* skippable */
/*
 * Loop over all data words of the auxiliary trie and replace each one
 * with auxiliary property bits taken from the Norm entry it refers to.
 * NOTE(review): the function header, the local declarations, the line that
 * starts the assignment ending at "norm->fncIndex;" and the closing braces
 * are missing from this extract (the embedded original line numbers skip);
 * restore them from the pristine file before compiling.
 */
1546 pData
=utrie_getData(auxTrie
, &length
);
1548 for(i
=0; i
<length
; ++i
) {
/* here the data word is used as an index into norms[] */
1549 norm
=norms
+pData
[i
];
1551 * 16-bit auxiliary normalization properties
/* bit 7 of combiningFlags is shifted up to bit _NORM_AUX_COMP_EX_SHIFT and or'ed with the FNC index */
1555 ((uint32_t)(norm
->combiningFlags
&0x80)<<(_NORM_AUX_COMP_EX_SHIFT
-7))|
1556 (uint32_t)norm
->fncIndex
;
/* set the unsafe bit for unsafe starters and for entries with a non-zero combining class */
1558 if(norm
->unsafeStart
|| norm
->udataCC
!=0) {
1559 pData
[i
]|=_NORM_AUX_UNSAFE_MASK
;
/* add the skippable flag computed by getSkippableFlags() for this entry */
1562 pData
[i
]|=getSkippableFlags(norm
);
1566 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
1567 static uint32_t U_CALLCONV
1568 getFoldedNormValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
1569 uint32_t value
, leadNorm32
=0;
1574 while(start
<limit
) {
1575 value
=utrie_get32(trie
, start
, &inBlockZero
);
1577 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1586 /* turn multi-bit fields into the worst-case value */
1587 if(leadNorm32
&_NORM_CC_MASK
) {
1588 leadNorm32
|=_NORM_CC_MASK
;
1591 /* clean up unnecessarily ored bit fields */
1592 leadNorm32
&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT
);
1595 /* nothing to do (only composition exclusions?) */
1599 /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
1601 (uint32_t)_NORM_EXTRA_INDEX_TOP
+
1602 (uint32_t)((offset
-UTRIE_BMP_INDEX_LENGTH
)>>UTRIE_SURROGATE_BLOCK_BITS
)
1603 )<<_NORM_EXTRA_SHIFT
;
1608 /* folding value for FCD: just store the offset (16 bits) if there is any non-0 entry */
1609 static uint32_t U_CALLCONV
1610 getFoldedFCDValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
1616 while(start
<limit
) {
1617 value
=utrie_get32(trie
, start
, &inBlockZero
);
1619 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1620 } else if(value
!=0) {
1621 return (uint32_t)offset
;
1630 * folding value for auxiliary data:
1631 * store the non-zero offset in bits 9..0 (FNC bits)
1632 * if there is any non-0 entry;
1633 * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
1635 static uint32_t U_CALLCONV
1636 getFoldedAuxValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
1637 uint32_t value
, oredValues
;
1643 while(start
<limit
) {
1644 value
=utrie_get32(trie
, start
, &inBlockZero
);
1646 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1654 /* move the 10 significant offset bits into bits 9..0 */
1655 offset
>>=UTRIE_SURROGATE_BLOCK_BITS
;
1656 if(offset
>_NORM_AUX_FNC_MASK
) {
1657 fprintf(stderr
, "gennorm error: folding offset too large (auxTrie)\n");
1658 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
1660 return (uint32_t)offset
|(oredValues
&~_NORM_AUX_FNC_MASK
);
/*
 * Post-processing steps, in the order in which they appear here:
 * reorder the decompositions, add the Hangul/Jamo specials, build the
 * canonical starter sets, clone the builder trie into the three final
 * tries, then finalize the data and print some statistics.
 * NOTE(review): the function header, closing braces and several statements
 * are missing from this extract (the embedded original line numbers skip),
 * including whatever follows each of the three "finalize" comments below
 * and whatever guards the loop and the printf() calls; restore them from
 * the pristine file before compiling.
 */
1674 /* canonically reorder decompositions and assign combining classes for decompositions */
1675 enumTrie(postParseFn
, NULL
);
/* prints one line for each non-zero combineAndQC[] entry */
1678 for(i
=1; i
<64; ++i
) {
1679 if(combineAndQC
[i
]) {
1680 printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i
&0xc)>>2, i
&0x33);
1685 /* add hangul/jamo specials */
1686 setHangulJamoSpecials();
1688 /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
1689 enumTrie(makeCanonSetFn
, NULL
);
1691 /* clone the normalization builder trie to make the final data tries */
/* all three clones are made from normTrie; any NULL result is fatal */
1692 if( NULL
==utrie_clone(norm32Trie
, normTrie
, NULL
, 0) ||
1693 NULL
==utrie_clone(fcdTrie
, normTrie
, NULL
, 0) ||
1694 NULL
==utrie_clone(auxTrie
, normTrie
, NULL
, 0)
1696 fprintf(stderr
, "error: unable to clone the normalization trie\n");
1697 exit(U_MEMORY_ALLOCATION_ERROR
);
1700 /* --- finalize data for quick checks & normalization --- */
1702 /* turn the Norm structs (stage2, norms) into 32-bit data words */
1705 /* --- finalize data for FCD checks --- */
1707 /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
1710 /* --- finalize auxiliary normalization data --- */
/* statistics output */
1715 printf("number of stage 2 entries: %ld\n", stage2Mem
->index
);
1716 printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT
*2+stage2Mem
->index
*4+extraMem
->index
*2);
1718 printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop
, combineBothTop
, combineBackTop
);
1719 printf("combining table count: %u\n", combiningTableTop
);
1723 #endif /* #if !UCONFIG_NO_NORMALIZATION */
/*
 * generateData(dataDir): serialize the three tries into static buffers,
 * pack canonStartSets[] into one contiguous block, append the two
 * normalization exclusion sets, fill indexes[] and write all parts into
 * the data file that udata_create() creates in dataDir.
 * NOTE(review): this extract is incomplete - the embedded original line
 * numbers skip wherever a line was lost (return type, #else/#endif lines,
 * the declaration of "set", closing braces, the statement after each error
 * message, the beginning of the size computation, ...). Restore those
 * lines from the pristine file before compiling.
 */
1726 generateData(const char *dataDir
) {
/* output buffers for the serialized tries */
1727 static uint8_t normTrieBlock
[100000], fcdTrieBlock
[100000], auxTrieBlock
[100000];
1729 UNewDataMemory
*pData
;
1730 UErrorCode errorCode
=U_ZERO_ERROR
;
1731 int32_t size
, dataLength
;
1733 #if UCONFIG_NO_NORMALIZATION
1739 U_STRING_DECL(nxCJKCompatPattern
, "[:Ideographic:]", 15);
1740 U_STRING_DECL(nxUnicode32Pattern
, "[:^Age=3.2:]", 12);
1742 int32_t normTrieSize
, fcdTrieSize
, auxTrieSize
;
/* serialize each builder trie with its own folding callback */
1744 normTrieSize
=utrie_serialize(norm32Trie
, normTrieBlock
, sizeof(normTrieBlock
), getFoldedNormValue
, FALSE
, &errorCode
);
1745 if(U_FAILURE(errorCode
)) {
1746 fprintf(stderr
, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode
));
1750 fcdTrieSize
=utrie_serialize(fcdTrie
, fcdTrieBlock
, sizeof(fcdTrieBlock
), getFoldedFCDValue
, TRUE
, &errorCode
);
1751 if(U_FAILURE(errorCode
)) {
1752 fprintf(stderr
, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode
));
1756 auxTrieSize
=utrie_serialize(auxTrie
, auxTrieBlock
, sizeof(auxTrieBlock
), getFoldedAuxValue
, TRUE
, &errorCode
);
1757 if(U_FAILURE(errorCode
)) {
1758 fprintf(stderr
, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode
));
1762 /* move the parts of canonStartSets[] together into a contiguous block */
/* first the BMP search table is moved down to canonStartSetsTop (lengths are in 16-bit units, hence *2) */
1763 if(canonStartSetsTop
<_NORM_MAX_CANON_SETS
) {
1764 uprv_memmove(canonStartSets
+canonStartSetsTop
,
1765 canonStartSets
+_NORM_MAX_CANON_SETS
,
1766 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]*2);
1768 canonStartSetsTop
+=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
/* then the supplementary search table is moved to follow it */
1770 if(canonStartSetsTop
<(_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
)) {
1771 uprv_memmove(canonStartSets
+canonStartSetsTop
,
1772 canonStartSets
+_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
,
1773 canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]*2);
1775 canonStartSetsTop
+=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
1777 /* create the normalization exclusion sets */
1779 * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
1780 * but we cannot use NFD_QC from the pattern because that would require
1781 * unorm.icu which we are just going to generate.
1782 * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
1785 U_STRING_INIT(nxCJKCompatPattern
, "[:Ideographic:]", 15);
1786 U_STRING_INIT(nxUnicode32Pattern
, "[:^Age=3.2:]", 12);
/* each exclusion set is serialized at canonStartSetsTop and its offset is recorded in an index slot */
1788 canonStartSets
[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
]=canonStartSetsTop
;
1789 set
=uset_openPattern(nxCJKCompatPattern
, -1, &errorCode
);
1790 if(U_FAILURE(errorCode
)) {
1791 fprintf(stderr
, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode
));
/* intersect with the manually collected nfdQCNoSet, see the comment above */
1794 uset_retainAll(set
, nfdQCNoSet
);
1795 canonStartSetsTop
+=uset_serialize(set
, canonStartSets
+canonStartSetsTop
, LENGTHOF(canonStartSets
)-canonStartSetsTop
, &errorCode
);
1796 if(U_FAILURE(errorCode
)) {
1797 fprintf(stderr
, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode
));
/* NOTE(review): "set" is reassigned below; confirm that the pristine file closes the first set before this point */
1802 canonStartSets
[_NORM_SET_INDEX_NX_UNICODE32_OFFSET
]=canonStartSetsTop
;
1803 set
=uset_openPattern(nxUnicode32Pattern
, -1, &errorCode
);
1804 if(U_FAILURE(errorCode
)) {
1805 fprintf(stderr
, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode
));
1808 canonStartSetsTop
+=uset_serialize(set
, canonStartSets
+canonStartSetsTop
, LENGTHOF(canonStartSets
)-canonStartSetsTop
, &errorCode
);
1809 if(U_FAILURE(errorCode
)) {
1810 fprintf(stderr
, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode
));
1815 canonStartSets
[_NORM_SET_INDEX_NX_RESERVED_OFFSET
]=canonStartSetsTop
;
1817 /* make sure that the FCD trie is 4-aligned */
1818 if((utm_countItems(extraMem
)+combiningTableTop
)&1) {
1819 combiningTable
[combiningTableTop
++]=0x1234; /* add one 16-bit word for an even number */
1822 /* pad canonStartSets to 4-alignment, too */
1823 if(canonStartSetsTop
&1) {
1824 canonStartSets
[canonStartSetsTop
++]=0x1235;
/*
 * Remaining terms of an expression that is later compared with the number
 * of bytes written ("size").
 * NOTE(review): the beginning of this expression and some of its terms
 * are missing from this extract.
 */
1830 utm_countItems(extraMem
)*2+
1831 combiningTableTop
*2+
1834 canonStartSetsTop
*2;
/* statistics output */
1837 printf("size of normalization trie %5u bytes\n", (int)normTrieSize
);
1838 printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem
));
1839 printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem
))[0]);
1840 printf("size of combining table %5u uint16_t\n", combiningTableTop
);
1841 printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize
);
1842 printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize
);
1843 printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop
);
1844 printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP
);
1845 printf(" size of sets %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-_NORM_SET_INDEX_TOP
);
1846 printf(" number of sets %5d\n", (int)canonSetsCount
);
1847 printf(" size of BMP search table %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]);
1848 printf(" size of supplementary search table %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]);
1849 printf(" length of exclusion sets %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_NX_RESERVED_OFFSET
]-canonStartSets
[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
]);
1850 printf("size of " U_ICUDATA_NAME
"_" DATA_NAME
"." DATA_TYPE
" contents: %ld bytes\n", (long)size
);
/* record the sizes and counts of the parts in indexes[], which is written first */
1853 indexes
[_NORM_INDEX_TRIE_SIZE
]=normTrieSize
;
1854 indexes
[_NORM_INDEX_UCHAR_COUNT
]=(uint16_t)utm_countItems(extraMem
);
1856 indexes
[_NORM_INDEX_COMBINE_DATA_COUNT
]=combiningTableTop
;
1857 indexes
[_NORM_INDEX_COMBINE_FWD_COUNT
]=combineFwdTop
;
1858 indexes
[_NORM_INDEX_COMBINE_BOTH_COUNT
]=(uint16_t)(combineBothTop
-combineFwdTop
);
1859 indexes
[_NORM_INDEX_COMBINE_BACK_COUNT
]=(uint16_t)(combineBackTop
-combineBothTop
);
1861 /* the quick check minimum code points are already set */
1863 indexes
[_NORM_INDEX_FCD_TRIE_SIZE
]=fcdTrieSize
;
1864 indexes
[_NORM_INDEX_AUX_TRIE_SIZE
]=auxTrieSize
;
1865 indexes
[_NORM_INDEX_CANON_SET_COUNT
]=canonStartSetsTop
;
1869 /* write the data */
/* the copyright string is passed only if haveCopyright is set */
1870 pData
=udata_create(dataDir
, DATA_TYPE
, DATA_NAME
, &dataInfo
,
1871 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, &errorCode
);
1872 if(U_FAILURE(errorCode
)) {
1873 fprintf(stderr
, "gennorm: unable to create the output file, error %d\n", errorCode
);
1877 #if !UCONFIG_NO_NORMALIZATION
/*
 * write order: indexes, normalization trie, extra data, combining table,
 * FCD trie, auxiliary trie, canonStartSets (16-bit arrays are written
 * with their item counts times 2)
 */
1879 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
1880 udata_writeBlock(pData
, normTrieBlock
, normTrieSize
);
1881 udata_writeBlock(pData
, utm_getStart(extraMem
), utm_countItems(extraMem
)*2);
1882 udata_writeBlock(pData
, combiningTable
, combiningTableTop
*2);
1883 udata_writeBlock(pData
, fcdTrieBlock
, fcdTrieSize
);
1884 udata_writeBlock(pData
, auxTrieBlock
, auxTrieSize
);
1885 udata_writeBlock(pData
, canonStartSets
, canonStartSetsTop
*2);
1890 dataLength
=udata_finish(pData
, &errorCode
);
1891 if(U_FAILURE(errorCode
)) {
1892 fprintf(stderr
, "gennorm: error %d writing the output file\n", errorCode
);
/* sanity check: the length returned by udata_finish() must equal the calculated size */
1896 if(dataLength
!=size
) {
1897 fprintf(stderr
, "gennorm error: data length %ld != calculated size %ld\n",
1898 (long)dataLength
, (long)size
);
1899 exit(U_INTERNAL_PROGRAM_ERROR
);
1903 #if !UCONFIG_NO_NORMALIZATION
/*
 * Release the builder data: close the canonStart set of every Norm entry,
 * then close the tool memories, the tries and nfdQCNoSet.
 * NOTE(review): the function header, the local declarations, the closing
 * braces and some statements are missing from this extract (the embedded
 * original line numbers skip, e.g. between 1911 and 1915 and after 1926);
 * restore them from the pristine file before compiling.
 */
1909 count
=utm_countItems(normMem
);
/* one set per Norm entry */
1910 for(i
=0; i
<count
; ++i
) {
1911 uset_close(norms
[i
].canonStart
);
1915 utm_close(utf32Mem
);
1916 utm_close(extraMem
);
1917 utm_close(combiningTriplesMem
);
1918 utrie_close(normTrie
);
1919 utrie_close(norm32Trie
);
1920 utrie_close(fcdTrie
);
1921 utrie_close(auxTrie
);
1923 uset_close(nfdQCNoSet
);
/* the trie structs themselves are freed after utrie_close() */
1925 uprv_free(normTrie
);
1926 uprv_free(norm32Trie
);
1931 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1934 * Hey, Emacs, please set the following:
1937 * indent-tabs-mode: nil