2 *******************************************************************************
4 * Copyright (C) 1999-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2001may25
14 * created by: Markus W. Scherer
16 * Store Unicode normalization data in a memory-mappable file.
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
27 #include "unicode/udata.h"
29 #include "unicode/uset.h"
36 #define DO_DEBUG_OUT 0
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression in any context (e.g. as the operand of sizeof or of a
 * postfix operator).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
41 * The new implementation of the normalization code loads its data from
42 * unorm.icu, which is generated with this gennorm tool.
43 * The format of that file is described in unormimp.h .
46 /* file data ---------------------------------------------------------------- */
48 #if UCONFIG_NO_NORMALIZATION
50 /* dummy UDataInfo cf. udata.h */
51 static UDataInfo dataInfo
= {
60 { 0, 0, 0, 0 }, /* dummy dataFormat */
61 { 0, 0, 0, 0 }, /* dummy formatVersion */
62 { 0, 0, 0, 0 } /* dummy dataVersion */
67 /* UDataInfo cf. udata.h */
68 static UDataInfo dataInfo
={
77 { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
78 { 2, 3, UTRIE_SHIFT
, UTRIE_INDEX_SHIFT
}, /* formatVersion */
79 { 3, 2, 0, 0 } /* dataVersion (Unicode version) */
/*
 * Record the data version from a version string:
 * v is parsed into version with u_versionFromString(), and then 4 bytes of
 * version are copied into dataInfo.dataVersion.
 */
83 setUnicodeVersion(const char *v
) {
85 u_versionFromString(version
, v
);
86 uprv_memcpy(dataInfo
.dataVersion
, version
, 4);
89 static int32_t indexes
[_NORM_INDEX_TOP
]={ 0 };
91 /* builder data ------------------------------------------------------------- */
93 /* modularization flags, see gennorm.h (default to "store everything") */
94 uint32_t gStoreFlags
=0xffffffff;
/*
 * Callback signature used by enumTrie(), which invokes it as
 * fn(context, code, norms+i): the caller's opaque context pointer,
 * a code point, and a pointer to a Norm unit for that code point.
 */
96 typedef void EnumTrieFn(void *context
, uint32_t code
, Norm
*norm
);
104 static UToolMemory
*normMem
, *utf32Mem
, *extraMem
, *combiningTriplesMem
;
109 * set a flag for each code point that was seen in decompositions -
110 * avoid decomposing ones that have not been used before
112 static uint32_t haveSeenFlags
[256];
114 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
115 static USet
*nfdQCNoSet
;
117 /* see addCombiningCP() for details */
118 static uint32_t combiningCPs
[2000];
121 * after processCombining() this contains for each code point in combiningCPs[]
122 * the runtime combining index
124 static uint16_t combiningIndexes
[2000];
126 /* section limits for combiningCPs[], see addCombiningCP() */
127 static uint16_t combineFwdTop
=0, combineBothTop
=0, combineBackTop
=0;
130 * Structure for a triple of code points, stored in combiningTriplesMem.
131 * The lead and trail code points combine into the combined one,
132 * i.e., there is a canonical decomposition of combined-> <lead, trail>.
134 * Before processCombining() is called, leadIndex and trailIndex are 0.
135 * After processCombining(), they contain the indexes of the lead and trail
136 * code point in the combiningCPs[] array.
137 * They are then sorted by leadIndex, then trailIndex.
138 * They are not sorted by code points.
140 typedef struct CombiningTriple
{
141 uint16_t leadIndex
, trailIndex
;
142 uint32_t lead
, trail
, combined
;
145 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
146 static uint16_t combiningTable
[0x8000];
147 static uint16_t combiningTableTop
=0;
149 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
150 static uint16_t canonStartSets
[_NORM_MAX_CANON_SETS
+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
151 +10000]; /* +10000 for exclusion sets */
152 static int32_t canonStartSetsTop
=_NORM_SET_INDEX_TOP
;
153 static int32_t canonSetsCount
=0;
155 /* allocate and initialize a Norm unit */
159 Norm
*p
=(Norm
*)utm_alloc(normMem
);
161 * The combiningIndex must not be initialized to 0 because 0 is the
162 * combiningIndex of the first forward-combining character.
164 p
->combiningIndex
=0xffff;
172 normTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
173 uprv_memset(normTrie
, 0, sizeof(UNewTrie
));
174 norm32Trie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
175 uprv_memset(norm32Trie
, 0, sizeof(UNewTrie
));
176 fcdTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
177 uprv_memset(fcdTrie
, 0, sizeof(UNewTrie
));
178 auxTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
179 uprv_memset(auxTrie
, 0, sizeof(UNewTrie
));
181 /* initialize the two tries */
182 if(NULL
==utrie_open(normTrie
, NULL
, 30000, 0, 0, FALSE
)) {
183 fprintf(stderr
, "error: failed to initialize tries\n");
184 exit(U_MEMORY_ALLOCATION_ERROR
);
187 /* allocate Norm structures and reset the first one */
188 normMem
=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm
));
191 /* allocate UTF-32 string memory */
192 utf32Mem
=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
194 /* reset all "have seen" flags */
195 uprv_memset(haveSeenFlags
, 0, sizeof(haveSeenFlags
));
197 /* open an empty set */
198 nfdQCNoSet
=uset_open(1, 0);
200 /* allocate extra data memory for UTF-16 decomposition strings and other values */
201 extraMem
=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP
, _NORM_EXTRA_INDEX_TOP
, 2);
202 /* initialize the extraMem counter for the top of FNC strings */
203 p16
=(uint16_t *)utm_alloc(extraMem
);
206 /* allocate temporary memory for combining triples */
207 combiningTriplesMem
=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple
));
209 /* set the minimum code points for no/maybe quick check values to the end of the BMP */
210 indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]=0xffff;
211 indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]=0xffff;
212 indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]=0xffff;
213 indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]=0xffff;
215 /* preset the indexes portion of canonStartSets */
216 uprv_memset(canonStartSets
, 0, _NORM_SET_INDEX_TOP
*2);
220 * get or create a Norm unit;
221 * get or create the intermediate trie entries for it as well
224 createNorm(uint32_t code
) {
228 i
=utrie_get32(normTrie
, (UChar32
)code
, NULL
);
234 if(!utrie_set32(normTrie
, (UChar32
)code
, (uint32_t)(p
-norms
))) {
235 fprintf(stderr
, "error: too many normalization entries\n");
236 exit(U_BUFFER_OVERFLOW_ERROR
);
242 /* get an existing Norm unit */
244 getNorm(uint32_t code
) {
247 i
=utrie_get32(normTrie
, (UChar32
)code
, NULL
);
254 /* get the canonical combining class of a character */
/*
 * Look up the Norm unit for code with getNorm() and return its udataCC field.
 * NOTE(review): the lines between the lookup and the return are not visible
 * in this view, so how a code point without a Norm unit is handled should be
 * confirmed against the full file.
 */
256 getCCFromCP(uint32_t code
) {
257 Norm
*norm
=getNorm(code
);
261 return norm
->udataCC
;
266 * enumerate all code points with their Norm structs and call a function for each
267 * return the number of code points with data
270 enumTrie(EnumTrieFn
*fn
, void *context
) {
276 for(code
=0; code
<=0x10ffff;) {
277 i
=utrie_get32(normTrie
, code
, &isInBlockZero
);
279 code
+=UTRIE_DATA_BLOCK_LENGTH
;
282 fn(context
, (uint32_t)code
, norms
+i
);
/*
 * Set "have seen" flags for the string s of the given length.
 * For a code point c, the flag is bit (c&0x1f) of haveSeenFlags[(c>>5)&0xff],
 * the same slot and bit that HAVE_SEEN(c) tests. Only the low 13 bits of c
 * select the flag, so code points that agree in those bits share one flag.
 * NOTE(review): how c is read from s is not visible in this view.
 * NOTE(review): 1<<(c&0x1f) is an int shift; for bit index 31 it shifts into
 * the sign bit. An unsigned 1 would avoid that - confirm and fix in the full file.
 */
292 setHaveSeenString(const uint32_t *s
, int32_t length
) {
297 haveSeenFlags
[(c
>>5)&0xff]|=(1<<(c
&0x1f));
/*
 * Test the "have seen" flag for code point c: non-zero if bit (c&0x1f) of
 * haveSeenFlags[(c>>5)&0xff] is set, otherwise 0.
 * The mask is built from an unsigned 1 so that bit index 31 does not shift
 * into the sign bit of an int (undefined behavior).
 * Note that c is evaluated twice; do not pass an expression with side effects.
 */
#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&((uint32_t)1<<((c)&0x1f)))
304 /* handle combining data ---------------------------------------------------- */
307 * Insert an entry into combiningCPs[] for the new code point code with its flags.
308 * The flags indicate if code combines forward, backward, or both.
310 * combiningCPs[] contains three sections:
311 * 1. code points that combine forward
312 * 2. code points that combine forward and backward
313 * 3. code points that combine backward
315 * Search for code in the entire array.
316 * If it is found and already is in the right section (old flags==new flags)
318 * If it is found but the flags are different, then remove it,
319 * union the old and new flags, and reinsert it into its correct section.
320 * If it is not found, then just insert it.
322 * Within each section, the code points are not sorted.
325 addCombiningCP(uint32_t code
, uint8_t flags
) {
329 newEntry
=code
|((uint32_t)flags
<<24);
331 /* search for this code point */
332 for(i
=0; i
<combineBackTop
; ++i
) {
333 if(code
==(combiningCPs
[i
]&0xffffff)) {
335 if(newEntry
==combiningCPs
[i
]) {
336 return; /* no change */
339 /* combine the flags, remove the old entry from the old place, and insert the new one */
340 newEntry
|=combiningCPs
[i
];
341 if(i
!=--combineBackTop
) {
342 uprv_memmove(combiningCPs
+i
, combiningCPs
+i
+1, (combineBackTop
-i
)*4);
344 if(i
<combineBothTop
) {
347 if(i
<combineFwdTop
) {
354 /* not found or modified, insert it */
355 if(combineBackTop
>=sizeof(combiningCPs
)/4) {
356 fprintf(stderr
, "error: gennorm combining code points - trying to use more than %ld units\n",
357 (long)(sizeof(combiningCPs
)/4));
358 exit(U_MEMORY_ALLOCATION_ERROR
);
361 /* set i to the insertion point */
362 flags
=(uint8_t)(newEntry
>>24);
366 } else if(flags
==3) {
368 } else /* flags==2 */ {
372 /* move the following code points up one and insert newEntry at i */
373 if(i
<combineBackTop
) {
374 uprv_memmove(combiningCPs
+i
+1, combiningCPs
+i
, (combineBackTop
-i
)*4);
376 combiningCPs
[i
]=newEntry
;
378 /* finally increment the total counter */
383 * Find the index in combiningCPs[] where code point code is stored.
384 * @param code code point to look for
385 * @param isLead is code a forward combining code point?
386 * @return index in combiningCPs[] where code is stored
389 findCombiningCP(uint32_t code
, UBool isLead
) {
394 limit
=combineBothTop
;
397 limit
=combineBackTop
;
400 /* search for this code point */
401 for(; i
<limit
; ++i
) {
402 if(code
==(combiningCPs
[i
]&0xffffff)) {
413 addCombiningTriple(uint32_t lead
, uint32_t trail
, uint32_t combined
) {
414 CombiningTriple
*triple
;
416 if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION
)) {
421 * set combiningFlags for the two code points
422 * do this after decomposition so that getNorm() above returns NULL
423 * if we do not have actual sub-decomposition data for the initial NFD here
425 createNorm(lead
)->combiningFlags
|=1; /* combines forward */
426 createNorm(trail
)->combiningFlags
|=2; /* combines backward */
428 addCombiningCP(lead
, 1);
429 addCombiningCP(trail
, 2);
431 triple
=(CombiningTriple
*)utm_alloc(combiningTriplesMem
);
434 triple
->combined
=combined
;
/*
 * Comparison callback handed to qsort() for an array of CombiningTriple.
 * l and r point to CombiningTriple items.
 * The visible lines compute the difference of the two leadIndex values and
 * the difference of the two trailIndex values.
 * NOTE(review): the control flow between the two assignments and the return
 * are not visible in this view; presumably trailIndex only decides when the
 * leadIndex values are equal ("sorted by leadIndex, then trailIndex") - confirm.
 */
438 compareTriples(const void *l
, const void *r
) {
/* difference of the lead indexes */
440 diff
=(int)((CombiningTriple
*)l
)->leadIndex
-
441 (int)((CombiningTriple
*)r
)->leadIndex
;
/* difference of the trail indexes */
443 diff
=(int)((CombiningTriple
*)l
)->trailIndex
-
444 (int)((CombiningTriple
*)r
)->trailIndex
;
451 CombiningTriple
*triples
;
454 uint16_t i
, j
, count
, tableTop
, finalIndex
, combinesFwd
;
456 triples
=utm_getStart(combiningTriplesMem
);
458 /* add lead and trail indexes to the triples for sorting */
459 count
=(uint16_t)utm_countItems(combiningTriplesMem
);
460 for(i
=0; i
<count
; ++i
) {
461 /* findCombiningCP() must always find the code point */
462 triples
[i
].leadIndex
=findCombiningCP(triples
[i
].lead
, TRUE
);
463 triples
[i
].trailIndex
=findCombiningCP(triples
[i
].trail
, FALSE
);
466 /* sort them by leadIndex, trailIndex */
467 qsort(triples
, count
, sizeof(CombiningTriple
), compareTriples
);
469 /* calculate final combining indexes and store them in the Norm entries */
471 j
=0; /* triples counter */
473 /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
474 for(i
=0; i
<combineBothTop
; ++i
) {
475 /* start a new table */
477 /* assign combining index */
478 createNorm(combiningCPs
[i
]&0xffffff)->combiningIndex
=combiningIndexes
[i
]=tableTop
;
480 /* calculate the length of the combining data for this lead code point in the combiningTable */
481 while(j
<count
&& i
==triples
[j
].leadIndex
) {
482 /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
483 combined
=triples
[j
++].combined
;
484 if(combined
<=0x1fff) {
492 /* second, combining indexes of back-only characters are simply incremented from here to be unique */
494 for(; i
<combineBackTop
; ++i
) {
495 createNorm(combiningCPs
[i
]&0xffffff)->combiningIndex
=combiningIndexes
[i
]=finalIndex
++;
498 /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
499 if(finalIndex
>0x8000) {
500 fprintf(stderr
, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
501 tableTop
, (long)(sizeof(combiningTable
)/4));
502 exit(U_MEMORY_ALLOCATION_ERROR
);
505 combiningTableTop
=tableTop
;
507 /* store the combining data in the combiningTable, with the final indexes from above */
509 j
=0; /* triples counter */
512 * this is essentially the same loop as above, but
513 * it writes the table data instead of calculating and setting the final indexes;
514 * it is necessary to have two passes so that all the final indexes are known before
515 * they are written into the table
517 for(i
=0; i
<combineBothTop
; ++i
) {
518 /* start a new table */
520 combined
=0; /* avoid compiler warning */
522 /* store the combining data for this lead code point in the combiningTable */
523 while(j
<count
&& i
==triples
[j
].leadIndex
) {
524 finalIndex
=combiningIndexes
[triples
[j
].trailIndex
];
525 combined
=triples
[j
++].combined
;
527 /* is combined a starter? (i.e., cc==0 && combines forward) */
528 combinesFwd
=(uint16_t)((getNorm(combined
)->combiningFlags
&1)<<13);
531 if(combined
<=0x1fff) {
532 *p
++=(uint16_t)(combinesFwd
|combined
);
533 } else if(combined
<=0xffff) {
534 *p
++=(uint16_t)(0x8000|combinesFwd
);
535 *p
++=(uint16_t)combined
;
537 *p
++=(uint16_t)(0xc000|combinesFwd
|((combined
-0x10000)>>10));
538 *p
++=(uint16_t)(0xdc00|(combined
&0x3ff));
542 /* set a marker on the last final trail index in this lead's table */
543 if(combined
<=0x1fff) {
550 /* post condition: tableTop==(p-combiningTable) */
553 /* processing incoming normalization data ----------------------------------- */
556 * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
557 * c must be a Hangul syllable code point.
560 getHangulDecomposition(uint32_t c
, Norm
*pHangulNorm
, uint32_t hangulBuffer
[3]) {
561 /* Hangul syllable: decompose algorithmically */
565 uprv_memset(pHangulNorm
, 0, sizeof(Norm
));
572 hangulBuffer
[2]=JAMO_T_BASE
+c2
;
579 hangulBuffer
[1]=JAMO_V_BASE
+c%JAMO_V_COUNT
;
580 hangulBuffer
[0]=JAMO_L_BASE
+c
/JAMO_V_COUNT
;
582 pHangulNorm
->nfd
=hangulBuffer
;
583 pHangulNorm
->lenNFD
=length
;
584 if(DO_STORE(UGENNORM_STORE_COMPAT
)) {
585 pHangulNorm
->nfkd
=hangulBuffer
;
586 pHangulNorm
->lenNFKD
=length
;
591 * decompose the one decomposition further, may generate two decompositions
592 * apply all previous characters' decompositions to this one
595 decompStoreNewNF(uint32_t code
, Norm
*norm
) {
596 uint32_t nfd
[40], nfkd
[40], hangulBuffer
[3];
603 uint8_t lenNFD
=0, lenNFKD
=0;
604 UBool changedNFD
=FALSE
, changedNFKD
=FALSE
;
606 if((length
=norm
->lenNFD
)!=0) {
607 /* always allocate the original string */
610 } else if((length
=norm
->lenNFKD
)!=0) {
611 /* always allocate the original string */
615 /* no decomposition here, nothing to do */
619 /* decompose each code point */
620 for(i
=0; i
<length
; ++i
) {
624 if(HANGUL_BASE
<=c
&& c
<(HANGUL_BASE
+HANGUL_COUNT
)) {
625 getHangulDecomposition(c
, &hangulNorm
, hangulBuffer
);
628 /* no data, no decomposition */
635 /* canonically decompose c */
638 uprv_memcpy(nfd
+lenNFD
, p
->nfd
, p
->lenNFD
*4);
645 /* compatibility-decompose c */
647 uprv_memcpy(nfkd
+lenNFKD
, p
->nfkd
, p
->lenNFKD
*4);
650 } else if(p
->lenNFD
!=0) {
651 uprv_memcpy(nfkd
+lenNFKD
, p
->nfd
, p
->lenNFD
*4);
654 * not changedNFKD=TRUE;
655 * so that we do not store a new nfkd if there was no nfkd string before
656 * and we only see canonical decompositions
663 /* assume that norm->lenNFD==1 or ==2 */
664 if(norm
->lenNFD
==2 && !(norm
->combiningFlags
&0x80)) {
665 addCombiningTriple(s32
[0], s32
[1], code
);
670 s32
=utm_allocN(utf32Mem
, lenNFD
);
671 uprv_memcpy(s32
, nfd
, lenNFD
*4);
677 setHaveSeenString(nfd
, lenNFD
);
681 s32
=utm_allocN(utf32Mem
, lenNFKD
);
682 uprv_memcpy(s32
, nfkd
, lenNFKD
*4);
686 norm
->lenNFKD
=lenNFKD
;
688 setHaveSeenString(nfkd
, lenNFKD
);
692 typedef struct DecompSingle
{
698 * apply this one character's decompositions (there is at least one!) to
699 * all previous characters' decompositions to decompose them further
702 decompWithSingleFn(void *context
, uint32_t code
, Norm
*norm
) {
703 uint32_t nfd
[40], nfkd
[40];
705 DecompSingle
*me
=(DecompSingle
*)context
;
708 uint8_t lenNFD
=0, lenNFKD
=0, myLenNFD
, myLenNFKD
;
709 UBool changedNFD
=FALSE
, changedNFKD
=FALSE
;
711 /* get the new character's data */
713 myLenNFD
=me
->norm
->lenNFD
;
714 myLenNFKD
=me
->norm
->lenNFKD
;
715 /* assume that myC has at least one decomposition */
717 if((length
=norm
->lenNFD
)!=0 && myLenNFD
!=0) {
718 /* apply NFD(myC) to norm->nfd */
720 for(i
=0; i
<length
; ++i
) {
723 uprv_memcpy(nfd
+lenNFD
, me
->norm
->nfd
, myLenNFD
*4);
732 if((length
=norm
->lenNFKD
)!=0) {
733 /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
735 for(i
=0; i
<length
; ++i
) {
739 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfkd
, myLenNFKD
*4);
741 } else /* assume myLenNFD!=0 */ {
742 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfd
, myLenNFD
*4);
750 } else if((length
=norm
->lenNFD
)!=0 && myLenNFKD
!=0) {
751 /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
753 for(i
=0; i
<length
; ++i
) {
756 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfkd
, myLenNFKD
*4);
765 /* set the new decompositions, forget the old ones */
768 if(lenNFD
>norm
->lenNFD
) {
769 s32
=utm_allocN(utf32Mem
, lenNFD
);
773 uprv_memcpy(s32
, nfd
, lenNFD
*4);
782 if(lenNFKD
>norm
->lenNFKD
) {
783 s32
=utm_allocN(utf32Mem
, lenNFKD
);
787 uprv_memcpy(s32
, nfkd
, lenNFKD
*4);
791 norm
->lenNFKD
=lenNFKD
;
797 * process the data for one code point listed in UnicodeData;
798 * UnicodeData itself never maps a code point to both NFD and NFKD
801 storeNorm(uint32_t code
, Norm
*norm
) {
802 DecompSingle decompSingle
;
805 if(DO_NOT_STORE(UGENNORM_STORE_COMPAT
)) {
806 /* ignore compatibility decomposition */
810 /* copy existing derived normalization properties */
812 norm
->qcFlags
=p
->qcFlags
;
813 norm
->combiningFlags
=p
->combiningFlags
;
814 norm
->fncIndex
=p
->fncIndex
;
816 /* process the decomposition if there is one here */
817 if((norm
->lenNFD
|norm
->lenNFKD
)!=0) {
818 /* decompose this one decomposition further, may generate two decompositions */
819 decompStoreNewNF(code
, norm
);
821 /* has this code point been used in previous decompositions? */
822 if(HAVE_SEEN(code
)) {
823 /* use this decomposition to decompose other decompositions further */
825 decompSingle
.norm
=norm
;
826 enumTrie(decompWithSingleFn
, &decompSingle
);
831 uprv_memcpy(p
, norm
, sizeof(Norm
));
835 setQCFlags(uint32_t code
, uint8_t qcFlags
) {
836 if(DO_NOT_STORE(UGENNORM_STORE_COMPAT
)) {
837 /* ignore compatibility decomposition: unset the KC/KD flags */
838 qcFlags
&=~(_NORM_QC_NFKC
|_NORM_QC_NFKD
);
840 /* set the KC/KD flags to the same values as the C/D flags */
843 if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION
)) {
844 /* ignore composition data: unset the C/KC flags */
845 qcFlags
&=~(_NORM_QC_NFC
|_NORM_QC_NFKC
);
847 /* set the C/KC flags to the same values as the D/KD flags */
851 createNorm(code
)->qcFlags
|=qcFlags
;
853 /* adjust the minimum code point for quick check no/maybe */
855 if((qcFlags
&_NORM_QC_NFC
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]) {
856 indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]=(uint16_t)code
;
858 if((qcFlags
&_NORM_QC_NFKC
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]) {
859 indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]=(uint16_t)code
;
861 if((qcFlags
&_NORM_QC_NFD
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]) {
862 indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]=(uint16_t)code
;
864 if((qcFlags
&_NORM_QC_NFKD
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]) {
865 indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]=(uint16_t)code
;
869 if(qcFlags
&_NORM_QC_NFD
) {
870 uset_add(nfdQCNoSet
, (UChar32
)code
);
/*
 * Mark code as a composition exclusion.
 * Only when composition data is stored (DO_STORE(UGENNORM_STORE_COMPOSITION)):
 * get or create the Norm unit for code and set bit 0x80 in its combiningFlags.
 * decompStoreNewNF() tests this bit and does not call addCombiningTriple()
 * for a two-code point NFD when it is set.
 */
875 setCompositionExclusion(uint32_t code
) {
876 if(DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
877 createNorm(code
)->combiningFlags
|=0x80;
882 setHangulJamoSpecials() {
887 * Hangul syllables are algorithmically decomposed into Jamos,
888 * and Jamos are algorithmically composed into Hangul syllables.
889 * The quick check flags are parsed, except for Hangul.
892 /* set Jamo L specials */
894 for(c
=0x1100; c
<=0x1112; ++c
) {
896 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_L
;
897 if(DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
898 norm
->combiningFlags
=1;
901 /* for each Jamo L create a set with its associated Hangul block */
902 norm
->canonStart
=uset_open(hangul
, hangul
+21*28-1);
906 /* set Jamo V specials */
907 for(c
=0x1161; c
<=0x1175; ++c
) {
909 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_V
;
910 if(DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
911 norm
->combiningFlags
=2;
913 norm
->unsafeStart
=TRUE
;
916 /* set Jamo T specials */
917 for(c
=0x11a8; c
<=0x11c2; ++c
) {
919 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_T
;
920 if(DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
921 norm
->combiningFlags
=2;
923 norm
->unsafeStart
=TRUE
;
926 /* set Hangul specials, precompacted */
928 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_HANGUL
;
929 if(DO_STORE(UGENNORM_STORE_COMPAT
)) {
930 norm
->qcFlags
=_NORM_QC_NFD
|_NORM_QC_NFKD
;
932 norm
->qcFlags
=_NORM_QC_NFD
;
935 if(!utrie_setRange32(normTrie
, 0xac00, 0xd7a4, (uint32_t)(norm
-norms
), TRUE
)) {
936 fprintf(stderr
, "error: too many normalization entries (setting Hangul)\n");
937 exit(U_BUFFER_OVERFLOW_ERROR
);
942 * set FC-NFKC-Closure string
943 * s contains the closure string; s[0]==length, s[1..length] is the actual string
947 setFNC(uint32_t c
, UChar
*s
) {
949 int32_t length
, i
, count
;
952 if( DO_NOT_STORE(UGENNORM_STORE_COMPAT
) ||
953 DO_NOT_STORE(UGENNORM_STORE_COMPOSITION
) ||
954 DO_NOT_STORE(UGENNORM_STORE_AUX
)
959 count
=utm_countItems(extraMem
);
963 /* try to overlay single-unit strings with existing ones */
964 if(length
==1 && first
<0xff00) {
965 p
=utm_getStart(extraMem
);
966 for(i
=1; i
<count
; ++i
) {
975 /* append the new string if it cannot be overlaid with an old one */
977 if(count
>_NORM_AUX_MAX_FNC
) {
978 fprintf(stderr
, "gennorm error: too many FNC strings\n");
979 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
982 /* prepend 0xffxx with xx==length */
983 s
[0]=(uint16_t)(0xff00+length
);
985 p
=(uint16_t *)utm_allocN(extraMem
, length
);
986 uprv_memcpy(p
, s
, length
*2);
988 /* update the top index in extraMem[0] */
990 ((uint16_t *)utm_getStart(extraMem
))[0]=(uint16_t)count
;
993 /* store the index to the string */
994 createNorm(c
)->fncIndex
=i
;
997 /* build runtime structures ------------------------------------------------- */
999 /* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
1001 reorderString(uint32_t *s
, int32_t length
) {
1011 for(i
=0; i
<length
; ++i
) {
1012 /* get the i-th code point and its combining class */
1016 /* it is a combining mark, see if it needs to be moved back */
1021 break; /* found the right place */
1023 /* move the previous code point here and go back */
1030 /* just store the combining class */
1035 return (uint16_t)(((uint16_t)ccs
[0]<<8)|ccs
[length
-1]);
1039 static UBool combineAndQC
[64]={ 0 };
1043 * canonically reorder the up to two decompositions
1044 * and store the leading and trailing combining classes accordingly
1046 * also process canonical decompositions for canonical closure
1049 postParseFn(void *context
, uint32_t code
, Norm
*norm
) {
1052 /* canonically order the NFD */
1053 length
=norm
->lenNFD
;
1055 norm
->canonBothCCs
=reorderString(norm
->nfd
, length
);
1058 /* canonically reorder the NFKD */
1059 length
=norm
->lenNFKD
;
1061 norm
->compatBothCCs
=reorderString(norm
->nfkd
, length
);
1064 /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
1065 if((norm
->lenNFD
!=0) != ((norm
->qcFlags
&_NORM_QC_NFD
)!=0)) {
1066 fprintf(stderr
, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code
, norm
->lenNFD
, norm
->qcFlags
);
1068 if(((norm
->lenNFD
|norm
->lenNFKD
)!=0) != ((norm
->qcFlags
&(_NORM_QC_NFD
|_NORM_QC_NFKD
))!=0)) {
1069 fprintf(stderr
, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code
, norm
->lenNFD
, norm
->lenNFKD
, norm
->qcFlags
);
1072 /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
1074 combineAndQC
[(norm
->qcFlags
&0x33)|((norm
->combiningFlags
&3)<<2)]=1;
1077 if(norm
->combiningFlags
&1) {
1078 if(norm
->udataCC
!=0) {
1079 /* illegal - data-derivable composition exclusion */
1080 fprintf(stderr
, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code
, norm
->udataCC
);
1083 if(norm
->combiningFlags
&2) {
1084 if((norm
->qcFlags
&0x11)==0) {
1085 fprintf(stderr
, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code
);
1088 /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
1089 if(norm
->udataCC
==0) {
1090 printf("U+%04lx combines backward but udataCC==0\n", (long)code
);
1094 if((norm
->combiningFlags
&3)==3 && beVerbose
) {
1095 printf("U+%04lx combines both ways\n", (long)code
);
1099 * process canonical decompositions for canonical closure
1101 * in each canonical decomposition:
1102 * add the current character (code) to the set of canonical starters of its norm->nfd[0]
1103 * set the "unsafe starter" flag for each norm->nfd[1..]
1105 length
=norm
->lenNFD
;
1111 /* nfd[0].canonStart.add(code) */
1113 otherNorm
=createNorm(c
);
1114 if(otherNorm
->canonStart
==NULL
) {
1115 otherNorm
->canonStart
=uset_open(code
, code
);
1116 if(otherNorm
->canonStart
==NULL
) {
1117 fprintf(stderr
, "gennorm error: out of memory in uset_open()\n");
1118 exit(U_MEMORY_ALLOCATION_ERROR
);
1121 uset_add(otherNorm
->canonStart
, code
);
1122 if(!uset_contains(otherNorm
->canonStart
, code
)) {
1123 fprintf(stderr
, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c
, (int)code
);
1124 exit(U_INTERNAL_PROGRAM_ERROR
);
1128 /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
1129 for(i
=1; i
<length
; ++i
) {
1130 createNorm(norm
->nfd
[i
])->unsafeStart
=TRUE
;
1136 make32BitNorm(Norm
*norm
) {
1140 int32_t i
, length
, beforeZero
=0, count
, start
;
1143 * Check for assumptions:
1145 * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
1146 * then the decomposition also begins with a true starter.
1148 if(norm
->udataCC
==0) {
1149 /* this is a starter */
1150 if((norm
->qcFlags
&_NORM_QC_NFC
)==0 && norm
->lenNFD
>0) {
1151 /* a "true" NFC starter with a canonical decomposition */
1152 if( norm
->canonBothCCs
>=0x100 || /* lead cc!=0 or */
1153 ((other
=getNorm(norm
->nfd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFC
)!=0) /* nfd[0] not NFC_YES */
1156 "error: true NFC starter canonical decomposition[%u] does not begin\n"
1157 " with a true NFC starter: U+%04lx U+%04lx%s\n",
1158 norm
->lenNFD
, (long)norm
->nfd
[0], (long)norm
->nfd
[1],
1159 norm
->lenNFD
<=2 ? "" : " ...");
1160 exit(U_INVALID_TABLE_FILE
);
1164 if((norm
->qcFlags
&_NORM_QC_NFKC
)==0) {
1165 if(norm
->lenNFKD
>0) {
1166 /* a "true" NFKC starter with a compatibility decomposition */
1167 if( norm
->compatBothCCs
>=0x100 || /* lead cc!=0 or */
1168 ((other
=getNorm(norm
->nfkd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFKC
)!=0) /* nfkd[0] not NFKC_YES */
1171 "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
1172 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1173 norm
->lenNFKD
, (long)norm
->nfkd
[0], (long)norm
->nfkd
[1],
1174 norm
->lenNFKD
<=2 ? "" : " ...");
1175 exit(U_INVALID_TABLE_FILE
);
1177 } else if(norm
->lenNFD
>0) {
1178 /* a "true" NFKC starter with only a canonical decomposition */
1179 if( norm
->canonBothCCs
>=0x100 || /* lead cc!=0 or */
1180 ((other
=getNorm(norm
->nfd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFKC
)!=0) /* nfd[0] not NFKC_YES */
1183 "error: true NFKC starter canonical decomposition[%u] does not begin\n"
1184 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1185 norm
->lenNFD
, (long)norm
->nfd
[0], (long)norm
->nfd
[1],
1186 norm
->lenNFD
<=2 ? "" : " ...");
1187 exit(U_INVALID_TABLE_FILE
);
1193 /* reset the 32-bit word and set the quick check flags */
1196 /* set the UnicodeData combining class */
1197 word
|=(uint32_t)norm
->udataCC
<<_NORM_CC_SHIFT
;
1199 /* set the combining flag and index */
1200 if(norm
->combiningFlags
&3) {
1201 word
|=(uint32_t)(norm
->combiningFlags
&3)<<6;
1204 /* set the combining index value into the extra data */
1205 /* 0xffff: no combining index; 0..0x7fff: combining index */
1206 if(norm
->combiningIndex
!=0xffff) {
1207 extra
[0]=norm
->combiningIndex
;
1213 /* write the decompositions */
1214 if((norm
->lenNFD
|norm
->lenNFKD
)!=0) {
1215 extra
[count
++]=0; /* set the pieces when available, into extra[beforeZero] */
1217 length
=norm
->lenNFD
;
1219 if(norm
->canonBothCCs
!=0) {
1220 extra
[beforeZero
]|=0x80;
1221 extra
[count
++]=norm
->canonBothCCs
;
1224 for(i
=0; i
<length
; ++i
) {
1225 UTF_APPEND_CHAR_UNSAFE(extra
, count
, norm
->nfd
[i
]);
1227 extra
[beforeZero
]|=(UChar
)(count
-start
); /* set the decomp length as the number of UTF-16 code units */
1230 length
=norm
->lenNFKD
;
1232 if(norm
->compatBothCCs
!=0) {
1233 extra
[beforeZero
]|=0x8000;
1234 extra
[count
++]=norm
->compatBothCCs
;
1237 for(i
=0; i
<length
; ++i
) {
1238 UTF_APPEND_CHAR_UNSAFE(extra
, count
, norm
->nfkd
[i
]);
1240 extra
[beforeZero
]|=(UChar
)((count
-start
)<<8); /* set the decomp length as the number of UTF-16 code units */
1244 /* allocate and copy the extra data */
1248 if(norm
->specialTag
!=0) {
1249 fprintf(stderr
, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm
->specialTag
);
1250 exit(U_ILLEGAL_ARGUMENT_ERROR
);
1253 p
=(UChar
*)utm_allocN(extraMem
, count
);
1254 uprv_memcpy(p
, extra
, count
*2);
1256 /* set the extra index, offset by beforeZero */
1257 word
|=(uint32_t)(beforeZero
+(p
-(UChar
*)utm_getStart(extraMem
)))<<_NORM_EXTRA_SHIFT
;
1258 } else if(norm
->specialTag
!=0) {
1259 /* set a special tag instead of an extra index */
1260 word
|=(uint32_t)norm
->specialTag
<<_NORM_EXTRA_SHIFT
;
1266 /* turn all Norm structs into corresponding 32-bit norm values */
1269 uint32_t *pNormData
;
1271 int32_t i
, normLength
, count
;
1273 count
=(int32_t)utm_countItems(normMem
);
1274 for(i
=0; i
<count
; ++i
) {
1275 norms
[i
].value32
=make32BitNorm(norms
+i
);
1278 pNormData
=utrie_getData(norm32Trie
, &normLength
);
1280 count
=0; /* count is now just used for debugging */
1281 for(i
=0; i
<normLength
; ++i
) {
1283 if(0!=(pNormData
[i
]=norms
[n
].value32
)) {
1290 * extract all Norm.canonBothCCs into the FCD table
1291 * set 32-bit values to use the common fold and compact functions
1297 int32_t i
, count
, fcdLength
;
1300 count
=utm_countItems(normMem
);
1301 for(i
=0; i
<count
; ++i
) {
1302 bothCCs
=norms
[i
].canonBothCCs
;
1304 /* if there are no decomposition cc's then use the udataCC twice */
1305 bothCCs
=norms
[i
].udataCC
;
1306 bothCCs
|=bothCCs
<<8;
1308 norms
[i
].value32
=bothCCs
;
1311 pFCDData
=utrie_getData(fcdTrie
, &fcdLength
);
1313 for(i
=0; i
<fcdLength
; ++i
) {
1315 pFCDData
[i
]=norms
[n
].value32
;
1320 * If the given set contains exactly one character, then return it.
1321 * Otherwise return -1.
/*
 * Single-code point test for a USet:
 * the set must have exactly one item (uset_getItemCount()==1), and that item,
 * fetched with uset_getItem(set, 0, ...), must be a range (len==0) whose
 * start and end are the same code point.
 * NOTE(review): ec is passed to uset_getItem() but is not inspected in the
 * lines visible here - confirm whether that is intended.
 */
1324 usetContainsOne(const USet
* set
) {
1325 if(uset_getItemCount(set
)==1) {
1326 /* there is a single item (a single range) */
1328 UErrorCode ec
=U_ZERO_ERROR
;
/* no string buffer is supplied (NULL, capacity 0); only start/end are requested */
1329 int32_t len
=uset_getItem(set
, 0, &start
, &end
, NULL
, 0, &ec
);
1330 if (len
==0 && start
==end
) { /* a range (len==0) with a single code point */
1338 makeCanonSetFn(void *context
, uint32_t code
, Norm
*norm
) {
1339 if(norm
->canonStart
!=NULL
&& !uset_isEmpty(norm
->canonStart
)) {
1341 int32_t c
, tableLength
;
1342 UErrorCode errorCode
=U_ZERO_ERROR
;
1344 /* does the set contain exactly one code point? */
1345 c
=usetContainsOne(norm
->canonStart
);
1347 /* add an entry to the BMP or supplementary search table */
1349 table
=canonStartSets
+_NORM_MAX_CANON_SETS
;
1350 tableLength
=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1352 table
[tableLength
++]=(uint16_t)code
;
1354 if(c
>=0 && c
<=0xffff && (c
&_NORM_CANON_SET_BMP_MASK
)!=_NORM_CANON_SET_BMP_IS_INDEX
) {
1355 /* single-code point BMP result for BMP code point */
1356 table
[tableLength
++]=(uint16_t)c
;
1358 table
[tableLength
++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX
|canonStartSetsTop
);
1361 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]=(uint16_t)tableLength
;
1363 table
=canonStartSets
+_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
;
1364 tableLength
=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
1366 table
[tableLength
++]=(uint16_t)(code
>>16);
1367 table
[tableLength
++]=(uint16_t)code
;
1370 /* single-code point result for supplementary code point */
1371 table
[tableLength
-2]|=(uint16_t)(0x8000|((c
>>8)&0x1f00));
1372 table
[tableLength
++]=(uint16_t)c
;
1374 table
[tableLength
++]=(uint16_t)canonStartSetsTop
;
1376 canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]=(uint16_t)tableLength
;
1380 /* write a USerializedSet */
1383 uset_serialize(norm
->canonStart
,
1384 canonStartSets
+canonStartSetsTop
,
1385 _NORM_MAX_CANON_SETS
-canonStartSetsTop
,
1388 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]=(uint16_t)canonStartSetsTop
;
1390 if(U_FAILURE(errorCode
)) {
1391 fprintf(stderr
, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode
), (int)canonStartSetsTop
);
1394 if(tableLength
>_NORM_MAX_SET_SEARCH_TABLE_LENGTH
) {
1395 fprintf(stderr
, "gennorm error: search table for canonical starter sets too long\n");
1396 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
1401 /* for getSkippableFlags ---------------------------------------------------- */
1403 /* combine the lead and trail code points; return <0 if they do not combine */
1405 combine(uint32_t lead
, uint32_t trail
) {
1406 CombiningTriple
*triples
;
1409 /* search for all triples with c as lead code point */
1410 triples
=utm_getStart(combiningTriplesMem
);
1411 count
=utm_countItems(combiningTriplesMem
);
1413 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1414 for(i
=0; i
<count
&& lead
!=triples
[i
].lead
; ++i
) {}
1416 /* check each triple for this code point */
1417 for(; i
<count
&& lead
==triples
[i
].lead
; ++i
) {
1418 if(trail
==triples
[i
].trail
) {
1419 return (int32_t)triples
[i
].combined
;
1427 * Starting from the canonical decomposition s[0..length[ of a single code point,
1428 * is the code point c consumed in an NFC/FCC recomposition?
1430 * No need to handle discontiguous composition because that would not consume some
1431 * intermediate character, so would not compose back to the original character.
1432 * See comments in canChangeWithFollowing().
1434 * No need to compose beyond where c canonically orders because if it is consumed
1435 * then the result differs from the original anyway.
1437 * Possible optimization:
1438 * - Verify that there are no cases of the same combining mark stacking twice.
1439 * - return FALSE right away if c inserts after a copy of itself
1440 * without attempting to recompose; will happen because each mark in
1441 * the decomposition will be enumerated and passed in as c.
1442 * More complicated and fragile though than it is already.
1447 doesComposeConsume(const uint32_t *s
, int32_t length
, uint32_t c
, uint8_t cc
) {
1450 /* ignore trailing characters where cc<prevCC */
1451 while(length
>1 && cc
<getCCFromCP(s
[length
-1])) {
1455 /* start consuming/combining from the beginning */
1456 starter
=(int32_t)s
[0];
1457 for(i
=1; i
<length
; ++i
) {
1458 starter
=combine((uint32_t)starter
, s
[i
]);
1460 fprintf(stderr
, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
1461 (int)s
[0], (int)s
[1], (int)length
, (int)c
, cc
);
1462 exit(U_INTERNAL_PROGRAM_ERROR
);
1466 /* try to combine/consume c, return TRUE if it is consumed */
1467 return combine((uint32_t)starter
, c
)>=0;
1470 /* does the starter s[0] combine forward with another char that is below trailCC? */
1472 canChangeWithFollowing(const uint32_t *s
, int32_t length
, uint8_t trailCC
) {
1474 /* no character will combine ahead of the trailing char of the decomposition */
1479 * We are only checking skippable condition (f).
1480 * Therefore, the original character does not have quick check flag NFC_NO (c),
1481 * i.e., the decomposition recomposes completely back into the original code point.
1482 * So s[0] must be a true starter with cc==0 and
1483 * combining with following code points.
1485 * Similarly, length==1 is not possible because that would be a singleton
1486 * decomposition which is marked with NFC_NO and does not pass (c).
1488 * Only a character with cc<trailCC can change the composition.
1489 * Reason: A char with cc>=trailCC would order after decomposition s[],
1490 * composition would consume all of the decomposition, and here we know that
1491 * the original char passed check d), i.e., it does not combine forward,
1492 * therefore does not combine with anything after the decomposition is consumed.
1494 * Now see if there is a character that
1495 * 1. combines backward
1497 * 3. is consumed in recomposition
1499 * length==2 is simple:
1501 * Characters that fulfill these conditions are exactly the ones that combine directly
1502 * with the starter c==s[0] because there is no intervening character after
1504 * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
1505 * and see if one has cc<trailCC (passes 2.).
1507 * length>2 is a little harder:
1509 * Since we will get different starters during recomposition, we need to
1510 * enumerate each backward-combining character (1.)
1511 * with cc<trailCC (2.) and
1512 * see if it gets consumed in recomposition. (3.)
1513 * No need to enumerate both-ways combining characters because they must have cc==0.
1516 /* enumerate all chars that combine with this one and check their cc */
1517 CombiningTriple
*triples
;
1518 uint32_t c
, i
, count
;
1521 /* search for all triples with c as lead code point */
1522 triples
=utm_getStart(combiningTriplesMem
);
1523 count
=utm_countItems(combiningTriplesMem
);
1526 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1527 for(i
=0; i
<count
&& c
!=triples
[i
].lead
; ++i
) {}
1529 /* check each triple for this code point */
1530 for(; i
<count
&& c
==triples
[i
].lead
; ++i
) {
1531 cc
=getCCFromCP(triples
[i
].trail
);
1532 if(cc
>0 && cc
<trailCC
) {
1533 /* this trail code point combines with c and has cc<trailCC */
1538 /* enumerate all chars that combine backward */
1543 for(i
=combineBothTop
; i
<combineBackTop
; ++i
) {
1544 c2
=combiningCPs
[i
]&0xffffff;
1546 /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
1547 if(cc
>0 && cc
<trailCC
&& doesComposeConsume(s
, length
-1, c2
, cc
)) {
1553 /* this decomposition is not modified by any appended character */
1557 /* see unormimp.h for details on NF*C Skippable flags */
1559 getSkippableFlags(const Norm
*norm
) {
1560 /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
1562 /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
1563 if(norm
->specialTag
==_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_HANGUL
) {
1567 /* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */
1571 * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
1573 * This means that (a)..(e) must always be derived from the runtime norm32 value,
1574 * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
1575 * the form is NF*C and there is a canonical decomposition (NFD_NO).
1577 * (a) unassigned code points get "not skippable"==false because they
1578 * don't have a Norm struct so they won't get here
1581 /* (b) not skippable if cc!=0 */
1582 if(norm
->udataCC
!=0) {
1583 return 0; /* non-zero flag for (f) only */
1587 * not NFC_Skippable if
1588 * (c) quick check flag == NO or
1589 * (d) combines forward or
1590 * (e) combines back or
1591 * (f) can change if another character is added
1594 * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
1595 * check its composition list,
1596 * see if any of the second code points in the list
1597 * has cc less than the trailCC of the decomposition.
1599 * For FCC: Test at runtime if the decomposition has a trailCC>1
1600 * -> there are characters with cc==1, they would order before the trail char
1601 * and prevent contiguous combination with the trail char.
1603 if( (norm
->qcFlags
&(_NORM_QC_NFC
&_NORM_QC_ANY_NO
))!=0 ||
1604 (norm
->combiningFlags
&3)!=0) {
1605 return 0; /* non-zero flag for (f) only */
1607 if(norm
->lenNFD
!=0 && canChangeWithFollowing(norm
->nfd
, norm
->lenNFD
, (uint8_t)norm
->canonBothCCs
)) {
1608 return _NORM_AUX_NFC_SKIP_F_MASK
;
1611 return 0; /* skippable */
1620 pData
=utrie_getData(auxTrie
, &length
);
1622 for(i
=0; i
<length
; ++i
) {
1623 norm
=norms
+pData
[i
];
1625 * 16-bit auxiliary normalization properties
1629 ((uint32_t)(norm
->combiningFlags
&0x80)<<(_NORM_AUX_COMP_EX_SHIFT
-7))|
1630 (uint32_t)norm
->fncIndex
;
1632 if(norm
->unsafeStart
|| norm
->udataCC
!=0) {
1633 pData
[i
]|=_NORM_AUX_UNSAFE_MASK
;
1636 pData
[i
]|=getSkippableFlags(norm
);
1640 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
1641 static uint32_t U_CALLCONV
1642 getFoldedNormValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
1643 uint32_t value
, leadNorm32
=0;
1648 while(start
<limit
) {
1649 value
=utrie_get32(trie
, start
, &inBlockZero
);
1651 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1660 /* turn multi-bit fields into the worst-case value */
1661 if(leadNorm32
&_NORM_CC_MASK
) {
1662 leadNorm32
|=_NORM_CC_MASK
;
1665 /* clean up unnecessarily ored bit fields */
1666 leadNorm32
&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT
);
1669 /* nothing to do (only composition exclusions?) */
1673 /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
1675 (uint32_t)_NORM_EXTRA_INDEX_TOP
+
1676 (uint32_t)((offset
-UTRIE_BMP_INDEX_LENGTH
)>>UTRIE_SURROGATE_BLOCK_BITS
)
1677 )<<_NORM_EXTRA_SHIFT
;
1682 /* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */
1685 * folding value for auxiliary data:
1686 * store the non-zero offset in bits 9..0 (FNC bits)
1687 * if there is any non-0 entry;
1688 * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
1690 static uint32_t U_CALLCONV
1691 getFoldedAuxValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
1692 uint32_t value
, oredValues
;
1698 while(start
<limit
) {
1699 value
=utrie_get32(trie
, start
, &inBlockZero
);
1701 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1709 /* move the 10 significant offset bits into bits 9..0 */
1710 offset
>>=UTRIE_SURROGATE_BLOCK_BITS
;
1711 if(offset
>_NORM_AUX_FNC_MASK
) {
1712 fprintf(stderr
, "gennorm error: folding offset too large (auxTrie)\n");
1713 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
1715 return (uint32_t)offset
|(oredValues
&~_NORM_AUX_FNC_MASK
);
1729 /* canonically reorder decompositions and assign combining classes for decompositions */
1730 enumTrie(postParseFn
, NULL
);
1733 for(i
=1; i
<64; ++i
) {
1734 if(combineAndQC
[i
]) {
1735 printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i
&0xc)>>2, i
&0x33);
1740 /* add hangul/jamo specials */
1741 setHangulJamoSpecials();
1743 /* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */
1744 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]=(uint16_t)canonStartSetsTop
;
1746 /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
1747 if(DO_STORE(UGENNORM_STORE_AUX
) && DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
1748 enumTrie(makeCanonSetFn
, NULL
);
1751 /* clone the normalization builder trie to make the final data tries */
1752 if( NULL
==utrie_clone(norm32Trie
, normTrie
, NULL
, 0) ||
1753 NULL
==utrie_clone(fcdTrie
, normTrie
, NULL
, 0) ||
1754 NULL
==utrie_clone(auxTrie
, normTrie
, NULL
, 0)
1756 fprintf(stderr
, "error: unable to clone the normalization trie\n");
1757 exit(U_MEMORY_ALLOCATION_ERROR
);
1760 /* --- finalize data for quick checks & normalization --- */
1762 /* turn the Norm structs (stage2, norms) into 32-bit data words */
1765 /* --- finalize data for FCD checks --- */
1767 /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
1770 /* --- finalize auxiliary normalization data --- */
1775 printf("number of stage 2 entries: %ld\n", stage2Mem
->index
);
1776 printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT
*2+stage2Mem
->index
*4+extraMem
->index
*2);
1778 printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop
, combineBothTop
, combineBackTop
);
1779 printf("combining table count: %u\n", combiningTableTop
);
1783 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1786 generateData(const char *dataDir
, UBool csource
) {
1787 static uint8_t normTrieBlock
[100000], fcdTrieBlock
[100000], auxTrieBlock
[100000];
1789 UNewDataMemory
*pData
;
1790 UErrorCode errorCode
=U_ZERO_ERROR
;
1791 int32_t size
, dataLength
;
1793 #if UCONFIG_NO_NORMALIZATION
1799 U_STRING_DECL(nxCJKCompatPattern
, "[:Ideographic:]", 15);
1800 U_STRING_DECL(nxUnicode32Pattern
, "[:^Age=3.2:]", 12);
1802 int32_t normTrieSize
, fcdTrieSize
, auxTrieSize
;
1804 normTrieSize
=utrie_serialize(norm32Trie
, normTrieBlock
, sizeof(normTrieBlock
), getFoldedNormValue
, FALSE
, &errorCode
);
1805 if(U_FAILURE(errorCode
)) {
1806 fprintf(stderr
, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode
));
1810 if(DO_STORE(UGENNORM_STORE_FCD
)) {
1811 fcdTrieSize
=utrie_serialize(fcdTrie
, fcdTrieBlock
, sizeof(fcdTrieBlock
), NULL
, TRUE
, &errorCode
);
1812 if(U_FAILURE(errorCode
)) {
1813 fprintf(stderr
, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode
));
1820 if(DO_STORE(UGENNORM_STORE_AUX
)) {
1821 auxTrieSize
=utrie_serialize(auxTrie
, auxTrieBlock
, sizeof(auxTrieBlock
), getFoldedAuxValue
, TRUE
, &errorCode
);
1822 if(U_FAILURE(errorCode
)) {
1823 fprintf(stderr
, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode
));
1830 /* move the parts of canonStartSets[] together into a contiguous block */
1831 if( canonStartSetsTop
<_NORM_MAX_CANON_SETS
&&
1832 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]!=0
1834 uprv_memmove(canonStartSets
+canonStartSetsTop
,
1835 canonStartSets
+_NORM_MAX_CANON_SETS
,
1836 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]*2);
1838 canonStartSetsTop
+=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1840 if( canonStartSetsTop
<(_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
) &&
1841 canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]!=0
1843 uprv_memmove(canonStartSets
+canonStartSetsTop
,
1844 canonStartSets
+_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
,
1845 canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]*2);
1847 canonStartSetsTop
+=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
1849 /* create the normalization exclusion sets */
1851 * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
1852 * but we cannot use NFD_QC from the pattern because that would require
1853 * unorm.icu which we are just going to generate.
1854 * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
1857 U_STRING_INIT(nxCJKCompatPattern
, "[:Ideographic:]", 15);
1858 U_STRING_INIT(nxUnicode32Pattern
, "[:^Age=3.2:]", 12);
1860 canonStartSets
[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
]=canonStartSetsTop
;
1861 set
=uset_openPattern(nxCJKCompatPattern
, -1, &errorCode
);
1862 if(U_FAILURE(errorCode
)) {
1863 fprintf(stderr
, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode
));
1866 uset_retainAll(set
, nfdQCNoSet
);
1867 if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS
)) {
1870 canonStartSetsTop
+=uset_serialize(set
, canonStartSets
+canonStartSetsTop
, LENGTHOF(canonStartSets
)-canonStartSetsTop
, &errorCode
);
1871 if(U_FAILURE(errorCode
)) {
1872 fprintf(stderr
, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode
));
1877 canonStartSets
[_NORM_SET_INDEX_NX_UNICODE32_OFFSET
]=canonStartSetsTop
;
1878 set
=uset_openPattern(nxUnicode32Pattern
, -1, &errorCode
);
1879 if(U_FAILURE(errorCode
)) {
1880 fprintf(stderr
, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode
));
1883 if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS
)) {
1886 canonStartSetsTop
+=uset_serialize(set
, canonStartSets
+canonStartSetsTop
, LENGTHOF(canonStartSets
)-canonStartSetsTop
, &errorCode
);
1887 if(U_FAILURE(errorCode
)) {
1888 fprintf(stderr
, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode
));
1893 canonStartSets
[_NORM_SET_INDEX_NX_RESERVED_OFFSET
]=canonStartSetsTop
;
1895 /* make sure that the FCD trie is 4-aligned */
1896 if((utm_countItems(extraMem
)+combiningTableTop
)&1) {
1897 combiningTable
[combiningTableTop
++]=0x1234; /* add one 16-bit word for an even number */
1900 /* pad canonStartSets to 4-alignment, too */
1901 if(canonStartSetsTop
&1) {
1902 canonStartSets
[canonStartSetsTop
++]=0x1235;
1908 utm_countItems(extraMem
)*2+
1909 combiningTableTop
*2+
1912 canonStartSetsTop
*2;
1915 printf("size of normalization trie %5u bytes\n", (int)normTrieSize
);
1916 printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem
));
1917 printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem
))[0]);
1918 printf("size of combining table %5u uint16_t\n", combiningTableTop
);
1919 printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize
);
1920 printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize
);
1921 printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop
);
1922 printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP
);
1923 printf(" size of sets %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-_NORM_SET_INDEX_TOP
);
1924 printf(" number of sets %5d\n", (int)canonSetsCount
);
1925 printf(" size of BMP search table %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]);
1926 printf(" size of supplementary search table %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]);
1927 printf(" length of exclusion sets %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_NX_RESERVED_OFFSET
]-canonStartSets
[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
]);
1928 printf("size of " U_ICUDATA_NAME
"_" DATA_NAME
"." DATA_TYPE
" contents: %ld bytes\n", (long)size
);
1931 indexes
[_NORM_INDEX_TRIE_SIZE
]=normTrieSize
;
1932 indexes
[_NORM_INDEX_UCHAR_COUNT
]=(uint16_t)utm_countItems(extraMem
);
1934 indexes
[_NORM_INDEX_COMBINE_DATA_COUNT
]=combiningTableTop
;
1935 indexes
[_NORM_INDEX_COMBINE_FWD_COUNT
]=combineFwdTop
;
1936 indexes
[_NORM_INDEX_COMBINE_BOTH_COUNT
]=(uint16_t)(combineBothTop
-combineFwdTop
);
1937 indexes
[_NORM_INDEX_COMBINE_BACK_COUNT
]=(uint16_t)(combineBackTop
-combineBothTop
);
1939 /* the quick check minimum code points are already set */
1941 indexes
[_NORM_INDEX_FCD_TRIE_SIZE
]=fcdTrieSize
;
1942 indexes
[_NORM_INDEX_AUX_TRIE_SIZE
]=auxTrieSize
;
1943 indexes
[_NORM_INDEX_CANON_SET_COUNT
]=canonStartSetsTop
;
1948 #if UCONFIG_NO_NORMALIZATION
1949 /* no csource for dummy mode..? */
1950 fprintf(stderr
, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n");
1953 /* write .c file for hardcoded data */
1954 UTrie normTrie2
={ NULL
}, fcdTrie2
={ NULL
}, auxTrie2
={ NULL
};
1957 utrie_unserialize(&normTrie2
, normTrieBlock
, normTrieSize
, &errorCode
);
1959 utrie_unserialize(&fcdTrie2
, fcdTrieBlock
, fcdTrieSize
, &errorCode
);
1962 utrie_unserialize(&auxTrie2
, auxTrieBlock
, auxTrieSize
, &errorCode
);
1964 if(U_FAILURE(errorCode
)) {
1967 "gennorm error: failed to utrie_unserialize() one of the tries - %s\n",
1968 u_errorName(errorCode
));
1972 f
=usrc_create(dataDir
, "unorm_props_data.c");
1975 "static const UVersionInfo formatVersion={ ",
1976 dataInfo
.formatVersion
, 8, 4,
1979 "static const UVersionInfo dataVersion={ ",
1980 dataInfo
.dataVersion
, 8, 4,
1983 "static const int32_t indexes[_NORM_INDEX_TOP]={\n",
1984 indexes
, 32, _NORM_INDEX_TOP
,
1986 usrc_writeUTrieArrays(f
,
1987 "static const uint16_t normTrie_index[%ld]={\n",
1988 "static const uint32_t normTrie_data32[%ld]={\n",
1991 usrc_writeUTrieStruct(f
,
1992 "static const UTrie normTrie={\n",
1993 &normTrie2
, "normTrie_index", "normTrie_data32", "getFoldingNormOffset",
1996 "static const uint16_t extraData[%ld]={\n",
1997 utm_getStart(extraMem
), 16, utm_countItems(extraMem
),
2000 "static const uint16_t combiningTable[%ld]={\n",
2001 combiningTable
, 16, combiningTableTop
,
2004 usrc_writeUTrieArrays(f
,
2005 "static const uint16_t fcdTrie_index[%ld]={\n", NULL
,
2008 usrc_writeUTrieStruct(f
,
2009 "static const UTrie fcdTrie={\n",
2010 &fcdTrie2
, "fcdTrie_index", NULL
, NULL
,
2013 fputs( "static const UTrie fcdTrie={ NULL };\n\n", f
);
2016 usrc_writeUTrieArrays(f
,
2017 "static const uint16_t auxTrie_index[%ld]={\n", NULL
,
2020 usrc_writeUTrieStruct(f
,
2021 "static const UTrie auxTrie={\n",
2022 &auxTrie2
, "auxTrie_index", NULL
, "getFoldingAuxOffset",
2025 fputs( "static const UTrie auxTrie={ NULL };\n\n", f
);
2028 "static const uint16_t canonStartSets[%ld]={\n",
2029 canonStartSets
, 16, canonStartSetsTop
,
2035 /* write the data */
2036 pData
=udata_create(dataDir
, DATA_TYPE
, DATA_NAME
, &dataInfo
,
2037 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, &errorCode
);
2038 if(U_FAILURE(errorCode
)) {
2039 fprintf(stderr
, "gennorm: unable to create the output file, error %d\n", errorCode
);
2043 #if !UCONFIG_NO_NORMALIZATION
2045 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
2046 udata_writeBlock(pData
, normTrieBlock
, normTrieSize
);
2047 udata_writeBlock(pData
, utm_getStart(extraMem
), utm_countItems(extraMem
)*2);
2048 udata_writeBlock(pData
, combiningTable
, combiningTableTop
*2);
2049 udata_writeBlock(pData
, fcdTrieBlock
, fcdTrieSize
);
2050 udata_writeBlock(pData
, auxTrieBlock
, auxTrieSize
);
2051 udata_writeBlock(pData
, canonStartSets
, canonStartSetsTop
*2);
2056 dataLength
=udata_finish(pData
, &errorCode
);
2057 if(U_FAILURE(errorCode
)) {
2058 fprintf(stderr
, "gennorm: error %d writing the output file\n", errorCode
);
2062 if(dataLength
!=size
) {
2063 fprintf(stderr
, "gennorm error: data length %ld != calculated size %ld\n",
2064 (long)dataLength
, (long)size
);
2065 exit(U_INTERNAL_PROGRAM_ERROR
);
2070 #if !UCONFIG_NO_NORMALIZATION
2076 count
=utm_countItems(normMem
);
2077 for(i
=0; i
<count
; ++i
) {
2078 uset_close(norms
[i
].canonStart
);
2082 utm_close(utf32Mem
);
2083 utm_close(extraMem
);
2084 utm_close(combiningTriplesMem
);
2085 utrie_close(normTrie
);
2086 utrie_close(norm32Trie
);
2087 utrie_close(fcdTrie
);
2088 utrie_close(auxTrie
);
2090 uset_close(nfdQCNoSet
);
2092 uprv_free(normTrie
);
2093 uprv_free(norm32Trie
);
2098 #endif /* #if !UCONFIG_NO_NORMALIZATION */
2101 * Hey, Emacs, please set the following:
2104 * indent-tabs-mode: nil