2 *******************************************************************************
4 * Copyright (C) 1999-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2001may25
14 * created by: Markus W. Scherer
16 * Store Unicode normalization data in a memory-mappable file.
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
27 #include "unicode/udata.h"
29 #include "unicode/uset.h"
36 #define DO_DEBUG_OUT 0
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot be split off
 * by a neighbouring operator (for example `sizeof LENGTHOF(a)`).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
41 * The new implementation of the normalization code loads its data from
42 * unorm.icu, which is generated with this gennorm tool.
43 * The format of that file is described in unormimp.h .
46 /* file data ---------------------------------------------------------------- */
48 #if UCONFIG_NO_NORMALIZATION
50 /* dummy UDataInfo cf. udata.h */
51 static UDataInfo dataInfo
= {
60 { 0, 0, 0, 0 }, /* dummy dataFormat */
61 { 0, 0, 0, 0 }, /* dummy formatVersion */
62 { 0, 0, 0, 0 } /* dummy dataVersion */
67 /* UDataInfo cf. udata.h */
68 static UDataInfo dataInfo
={
77 { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
78 { 2, 3, UTRIE_SHIFT
, UTRIE_INDEX_SHIFT
}, /* formatVersion */
79 { 3, 2, 0, 0 } /* dataVersion (Unicode version) */
/*
 * Parse the version string v with u_versionFromString() and copy 4 bytes of
 * the parsed result into dataInfo.dataVersion.
 */
83 setUnicodeVersion(const char *v
) {
85 u_versionFromString(version
, v
);
86 uprv_memcpy(dataInfo
.dataVersion
, version
, 4);
/*
 * Table of _NORM_INDEX_TOP index values, zero-initialized.
 * The _NORM_INDEX_MIN_*_NO_MAYBE entries are preset to 0xffff during
 * initialization and lowered in setQCFlags().
 */
89 static int32_t indexes
[_NORM_INDEX_TOP
]={ 0 };
91 /* builder data ------------------------------------------------------------- */
93 /* modularization flags, see gennorm.h (default to "store everything") */
94 uint32_t gStoreFlags
=0xffffffff;
/*
 * Callback type for enumTrie(): called with the caller-supplied context,
 * a code point, and a pointer to a Norm struct for that code point.
 */
96 typedef void EnumTrieFn(void *context
, uint32_t code
, Norm
*norm
);
104 static UToolMemory
*normMem
, *utf32Mem
, *extraMem
, *combiningTriplesMem
;
109 * set a flag for each code point that was seen in decompositions -
110 * avoid decomposing ones that have not been used before
112 static uint32_t haveSeenFlags
[256];
114 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
115 static USet
*nfdQCNoSet
;
117 /* see addCombiningCP() for details */
118 static uint32_t combiningCPs
[2000];
121 * after processCombining() this contains for each code point in combiningCPs[]
122 * the runtime combining index
124 static uint16_t combiningIndexes
[2000];
126 /* section limits for combiningCPs[], see addCombiningCP() */
127 static uint16_t combineFwdTop
=0, combineBothTop
=0, combineBackTop
=0;
130 * Structure for a triple of code points, stored in combiningTriplesMem.
131 * The lead and trail code points combine into the combined one,
132 * i.e., there is a canonical decomposition of combined-> <lead, trail>.
134 * Before processCombining() is called, leadIndex and trailIndex are 0.
135 * After processCombining(), they contain the indexes of the lead and trail
136 * code point in the combiningCPs[] array.
137 * They are then sorted by leadIndex, then trailIndex.
138 * They are not sorted by code points.
140 typedef struct CombiningTriple
{
141 uint16_t leadIndex
, trailIndex
;
142 uint32_t lead
, trail
, combined
;
145 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
146 static uint16_t combiningTable
[0x8000];
147 static uint16_t combiningTableTop
=0;
149 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
150 static uint16_t canonStartSets
[_NORM_MAX_CANON_SETS
+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
151 +10000]; /* +10000 for exclusion sets */
152 static int32_t canonStartSetsTop
=_NORM_SET_INDEX_TOP
;
153 static int32_t canonSetsCount
=0;
155 /* allocate and initialize a Norm unit */
159 Norm
*p
=(Norm
*)utm_alloc(normMem
);
161 * The combiningIndex must not be initialized to 0 because 0 is the
162 * combiningIndex of the first forward-combining character.
164 p
->combiningIndex
=0xffff;
172 normTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
173 uprv_memset(normTrie
, 0, sizeof(UNewTrie
));
174 norm32Trie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
175 uprv_memset(norm32Trie
, 0, sizeof(UNewTrie
));
176 fcdTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
177 uprv_memset(fcdTrie
, 0, sizeof(UNewTrie
));
178 auxTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
179 uprv_memset(auxTrie
, 0, sizeof(UNewTrie
));
181 /* initialize the two tries */
182 if(NULL
==utrie_open(normTrie
, NULL
, 30000, 0, 0, FALSE
)) {
183 fprintf(stderr
, "error: failed to initialize tries\n");
184 exit(U_MEMORY_ALLOCATION_ERROR
);
187 /* allocate Norm structures and reset the first one */
188 normMem
=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm
));
191 /* allocate UTF-32 string memory */
192 utf32Mem
=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
194 /* reset all "have seen" flags */
195 uprv_memset(haveSeenFlags
, 0, sizeof(haveSeenFlags
));
197 /* open an empty set */
198 nfdQCNoSet
=uset_open(1, 0);
200 /* allocate extra data memory for UTF-16 decomposition strings and other values */
201 extraMem
=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP
, _NORM_EXTRA_INDEX_TOP
, 2);
202 /* initialize the extraMem counter for the top of FNC strings */
203 p16
=(uint16_t *)utm_alloc(extraMem
);
206 /* allocate temporary memory for combining triples */
207 combiningTriplesMem
=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple
));
209 /* set the minimum code points for no/maybe quick check values to the end of the BMP */
210 indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]=0xffff;
211 indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]=0xffff;
212 indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]=0xffff;
213 indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]=0xffff;
215 /* preset the indexes portion of canonStartSets */
216 uprv_memset(canonStartSets
, 0, _NORM_SET_INDEX_TOP
*2);
220 * get or create a Norm unit;
221 * get or create the intermediate trie entries for it as well
224 createNorm(uint32_t code
) {
228 i
=utrie_get32(normTrie
, (UChar32
)code
, NULL
);
234 if(!utrie_set32(normTrie
, (UChar32
)code
, (uint32_t)(p
-norms
))) {
235 fprintf(stderr
, "error: too many normalization entries\n");
236 exit(U_BUFFER_OVERFLOW_ERROR
);
242 /* get an existing Norm unit */
244 getNorm(uint32_t code
) {
247 i
=utrie_get32(normTrie
, (UChar32
)code
, NULL
);
254 /* get the canonical combining class of a character */
256 getCCFromCP(uint32_t code
) {
257 Norm
*norm
=getNorm(code
);
261 return norm
->udataCC
;
266 * enumerate all code points with their Norm structs and call a function for each
267 * return the number of code points with data
270 enumTrie(EnumTrieFn
*fn
, void *context
) {
276 for(code
=0; code
<=0x10ffff;) {
277 i
=utrie_get32(normTrie
, code
, &isInBlockZero
);
279 code
+=UTRIE_DATA_BLOCK_LENGTH
;
282 fn(context
, (uint32_t)code
, norms
+i
);
/*
 * Set the "have seen" bit in haveSeenFlags[] for a code point c:
 * the word index is (c>>5)&0xff and the bit number is c&0x1f.
 * The loop that takes c from s[0..length) is not visible in this excerpt;
 * presumably one bit is set per code point of s.
 * NOTE(review): 1<<(c&0x1f) shifts a signed int; for bit 31 an unsigned
 * operand ((uint32_t)1) would avoid signed overflow - confirm and fix.
 */
292 setHaveSeenString(const uint32_t *s
, int32_t length
) {
297 haveSeenFlags
[(c
>>5)&0xff]|=(1<<(c
&0x1f));
/*
 * Test the "have seen" bit for code point c; non-zero if the bit is set.
 * (c>>5)&0xff selects one of the 256 words of haveSeenFlags[] and c&0x1f
 * selects the bit within that word.
 * The mask is built from an unsigned 1 so that bit 31 is well-defined
 * (1<<31 overflows a signed int).
 * c is evaluated twice; do not pass an expression with side effects.
 */
#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&((uint32_t)1<<((c)&0x1f)))
304 /* handle combining data ---------------------------------------------------- */
307 * Insert an entry into combiningCPs[] for the new code point code with its flags.
308 * The flags indicate if code combines forward, backward, or both.
310 * combiningCPs[] contains three sections:
311 * 1. code points that combine forward
312 * 2. code points that combine forward and backward
313 * 3. code points that combine backward
315 * Search for code in the entire array.
316 * If it is found and already is in the right section (old flags==new flags)
318 * If it is found but the flags are different, then remove it,
319 * union the old and new flags, and reinsert it into its correct section.
320 * If it is not found, then just insert it.
322 * Within each section, the code points are not sorted.
325 addCombiningCP(uint32_t code
, uint8_t flags
) {
329 newEntry
=code
|((uint32_t)flags
<<24);
331 /* search for this code point */
332 for(i
=0; i
<combineBackTop
; ++i
) {
333 if(code
==(combiningCPs
[i
]&0xffffff)) {
335 if(newEntry
==combiningCPs
[i
]) {
336 return; /* no change */
339 /* combine the flags, remove the old entry from the old place, and insert the new one */
340 newEntry
|=combiningCPs
[i
];
341 if(i
!=--combineBackTop
) {
342 uprv_memmove(combiningCPs
+i
, combiningCPs
+i
+1, (combineBackTop
-i
)*4);
344 if(i
<combineBothTop
) {
347 if(i
<combineFwdTop
) {
354 /* not found or modified, insert it */
355 if(combineBackTop
>=sizeof(combiningCPs
)/4) {
356 fprintf(stderr
, "error: gennorm combining code points - trying to use more than %ld units\n",
357 (long)(sizeof(combiningCPs
)/4));
358 exit(U_MEMORY_ALLOCATION_ERROR
);
361 /* set i to the insertion point */
362 flags
=(uint8_t)(newEntry
>>24);
366 } else if(flags
==3) {
368 } else /* flags==2 */ {
372 /* move the following code points up one and insert newEntry at i */
373 if(i
<combineBackTop
) {
374 uprv_memmove(combiningCPs
+i
+1, combiningCPs
+i
, (combineBackTop
-i
)*4);
376 combiningCPs
[i
]=newEntry
;
378 /* finally increment the total counter */
383 * Find the index in combiningCPs[] where code point code is stored.
384 * @param code code point to look for
385 * @param isLead is code a forward combining code point?
386 * @return index in combiningCPs[] where code is stored
389 findCombiningCP(uint32_t code
, UBool isLead
) {
394 limit
=combineBothTop
;
397 limit
=combineBackTop
;
400 /* search for this code point */
401 for(; i
<limit
; ++i
) {
402 if(code
==(combiningCPs
[i
]&0xffffff)) {
/*
 * Record that the code points lead and trail combine into combined.
 *
 * Checks DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) first (the body of that
 * branch is not visible in this excerpt; presumably an early return).
 * Then marks lead as combining forward (flag 1) and trail as combining
 * backward (flag 2), both in their Norm structs (created if necessary via
 * createNorm()) and in combiningCPs[] via addCombiningCP().
 * Finally allocates a CombiningTriple from combiningTriplesMem and stores
 * combined in it.
 */
413 addCombiningTriple(uint32_t lead
, uint32_t trail
, uint32_t combined
) {
414 CombiningTriple
*triple
;
416 if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION
)) {
421 * set combiningFlags for the two code points
422 * do this after decomposition so that getNorm() above returns NULL
423 * if we do not have actual sub-decomposition data for the initial NFD here
425 createNorm(lead
)->combiningFlags
|=1; /* combines forward */
426 createNorm(trail
)->combiningFlags
|=2; /* combines backward */
428 addCombiningCP(lead
, 1);
429 addCombiningCP(trail
, 2);
431 triple
=(CombiningTriple
*)utm_alloc(combiningTriplesMem
);
434 triple
->combined
=combined
;
/*
 * Comparison function passed to qsort() for CombiningTriple items.
 * l and r point to the two CombiningTriple structs to compare.
 * Computes the difference of the leadIndex fields and the difference of the
 * trailIndex fields (presumably the latter only when the lead indexes are
 * equal; that condition is not visible in this excerpt).
 */
438 compareTriples(const void *l
, const void *r
) {
440 diff
=(int)((CombiningTriple
*)l
)->leadIndex
-
441 (int)((CombiningTriple
*)r
)->leadIndex
;
443 diff
=(int)((CombiningTriple
*)l
)->trailIndex
-
444 (int)((CombiningTriple
*)r
)->trailIndex
;
451 CombiningTriple
*triples
;
454 uint16_t i
, j
, count
, tableTop
, finalIndex
, combinesFwd
;
456 triples
=utm_getStart(combiningTriplesMem
);
458 /* add lead and trail indexes to the triples for sorting */
459 count
=(uint16_t)utm_countItems(combiningTriplesMem
);
460 for(i
=0; i
<count
; ++i
) {
461 /* findCombiningCP() must always find the code point */
462 triples
[i
].leadIndex
=findCombiningCP(triples
[i
].lead
, TRUE
);
463 triples
[i
].trailIndex
=findCombiningCP(triples
[i
].trail
, FALSE
);
466 /* sort them by leadIndex, trailIndex */
467 qsort(triples
, count
, sizeof(CombiningTriple
), compareTriples
);
469 /* calculate final combining indexes and store them in the Norm entries */
471 j
=0; /* triples counter */
473 /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
474 for(i
=0; i
<combineBothTop
; ++i
) {
475 /* start a new table */
477 /* assign combining index */
478 createNorm(combiningCPs
[i
]&0xffffff)->combiningIndex
=combiningIndexes
[i
]=tableTop
;
480 /* calculate the length of the combining data for this lead code point in the combiningTable */
481 while(j
<count
&& i
==triples
[j
].leadIndex
) {
482 /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
483 combined
=triples
[j
++].combined
;
484 if(combined
<=0x1fff) {
492 /* second, combining indexes of back-only characters are simply incremented from here to be unique */
494 for(; i
<combineBackTop
; ++i
) {
495 createNorm(combiningCPs
[i
]&0xffffff)->combiningIndex
=combiningIndexes
[i
]=finalIndex
++;
498 /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
499 if(finalIndex
>0x8000) {
500 fprintf(stderr
, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
501 tableTop
, (long)(sizeof(combiningTable
)/4));
502 exit(U_MEMORY_ALLOCATION_ERROR
);
505 combiningTableTop
=tableTop
;
507 /* store the combining data in the combiningTable, with the final indexes from above */
509 j
=0; /* triples counter */
512 * this is essentially the same loop as above, but
513 * it writes the table data instead of calculating and setting the final indexes;
514 * it is necessary to have two passes so that all the final indexes are known before
515 * they are written into the table
517 for(i
=0; i
<combineBothTop
; ++i
) {
518 /* start a new table */
520 combined
=0; /* avoid compiler warning */
522 /* store the combining data for this lead code point in the combiningTable */
523 while(j
<count
&& i
==triples
[j
].leadIndex
) {
525 finalIndex
=combiningIndexes
[triples
[j
].trailIndex
];
526 combined
=triples
[j
++].combined
;
527 normPtr
= getNorm(combined
);
529 if (normPtr
== NULL
) {
530 fprintf(stderr
, "error: processCombining did not get expected result. combined=%d\n", combined
);
531 exit(U_INTERNAL_PROGRAM_ERROR
);
534 /* is combined a starter? (i.e., cc==0 && combines forward) */
535 combinesFwd
=(uint16_t)((normPtr
->combiningFlags
&1)<<13);
538 if(combined
<=0x1fff) {
539 *p
++=(uint16_t)(combinesFwd
|combined
);
540 } else if(combined
<=0xffff) {
541 *p
++=(uint16_t)(0x8000|combinesFwd
);
542 *p
++=(uint16_t)combined
;
544 *p
++=(uint16_t)(0xc000|combinesFwd
|((combined
-0x10000)>>10));
545 *p
++=(uint16_t)(0xdc00|(combined
&0x3ff));
549 /* set a marker on the last final trail index in this lead's table */
550 if(combined
<=0x1fff) {
557 /* post condition: tableTop==(p-combiningTable) */
560 /* processing incoming normalization data ----------------------------------- */
563 * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
564 * c must be a Hangul syllable code point.
567 getHangulDecomposition(uint32_t c
, Norm
*pHangulNorm
, uint32_t hangulBuffer
[3]) {
568 /* Hangul syllable: decompose algorithmically */
572 uprv_memset(pHangulNorm
, 0, sizeof(Norm
));
579 hangulBuffer
[2]=JAMO_T_BASE
+c2
;
586 hangulBuffer
[1]=JAMO_V_BASE
+c%JAMO_V_COUNT
;
587 hangulBuffer
[0]=JAMO_L_BASE
+c
/JAMO_V_COUNT
;
589 pHangulNorm
->nfd
=hangulBuffer
;
590 pHangulNorm
->lenNFD
=length
;
591 if(DO_STORE(UGENNORM_STORE_COMPAT
)) {
592 pHangulNorm
->nfkd
=hangulBuffer
;
593 pHangulNorm
->lenNFKD
=length
;
598 * decompose the one decomposition further, may generate two decompositions
599 * apply all previous characters' decompositions to this one
602 decompStoreNewNF(uint32_t code
, Norm
*norm
) {
603 uint32_t nfd
[40], nfkd
[40], hangulBuffer
[3];
610 uint8_t lenNFD
=0, lenNFKD
=0;
611 UBool changedNFD
=FALSE
, changedNFKD
=FALSE
;
613 if((length
=norm
->lenNFD
)!=0) {
614 /* always allocate the original string */
617 } else if((length
=norm
->lenNFKD
)!=0) {
618 /* always allocate the original string */
622 /* no decomposition here, nothing to do */
626 /* decompose each code point */
627 for(i
=0; i
<length
; ++i
) {
631 if(HANGUL_BASE
<=c
&& c
<(HANGUL_BASE
+HANGUL_COUNT
)) {
632 getHangulDecomposition(c
, &hangulNorm
, hangulBuffer
);
635 /* no data, no decomposition */
642 /* canonically decompose c */
645 uprv_memcpy(nfd
+lenNFD
, p
->nfd
, p
->lenNFD
*4);
652 /* compatibility-decompose c */
654 uprv_memcpy(nfkd
+lenNFKD
, p
->nfkd
, p
->lenNFKD
*4);
657 } else if(p
->lenNFD
!=0) {
658 uprv_memcpy(nfkd
+lenNFKD
, p
->nfd
, p
->lenNFD
*4);
661 * not changedNFKD=TRUE;
662 * so that we do not store a new nfkd if there was no nfkd string before
663 * and we only see canonical decompositions
670 /* assume that norm->lenNFD==1 or ==2 */
671 if(norm
->lenNFD
==2 && !(norm
->combiningFlags
&0x80)) {
672 addCombiningTriple(s32
[0], s32
[1], code
);
677 s32
=utm_allocN(utf32Mem
, lenNFD
);
678 uprv_memcpy(s32
, nfd
, lenNFD
*4);
684 setHaveSeenString(nfd
, lenNFD
);
688 s32
=utm_allocN(utf32Mem
, lenNFKD
);
689 uprv_memcpy(s32
, nfkd
, lenNFKD
*4);
693 norm
->lenNFKD
=lenNFKD
;
695 setHaveSeenString(nfkd
, lenNFKD
);
699 typedef struct DecompSingle
{
705 * apply this one character's decompositions (there is at least one!) to
706 * all previous characters' decompositions to decompose them further
709 decompWithSingleFn(void *context
, uint32_t code
, Norm
*norm
) {
710 uint32_t nfd
[40], nfkd
[40];
712 DecompSingle
*me
=(DecompSingle
*)context
;
715 uint8_t lenNFD
=0, lenNFKD
=0, myLenNFD
, myLenNFKD
;
716 UBool changedNFD
=FALSE
, changedNFKD
=FALSE
;
718 /* get the new character's data */
720 myLenNFD
=me
->norm
->lenNFD
;
721 myLenNFKD
=me
->norm
->lenNFKD
;
722 /* assume that myC has at least one decomposition */
724 if((length
=norm
->lenNFD
)!=0 && myLenNFD
!=0) {
725 /* apply NFD(myC) to norm->nfd */
727 for(i
=0; i
<length
; ++i
) {
730 uprv_memcpy(nfd
+lenNFD
, me
->norm
->nfd
, myLenNFD
*4);
739 if((length
=norm
->lenNFKD
)!=0) {
740 /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
742 for(i
=0; i
<length
; ++i
) {
746 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfkd
, myLenNFKD
*4);
748 } else /* assume myLenNFD!=0 */ {
749 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfd
, myLenNFD
*4);
757 } else if((length
=norm
->lenNFD
)!=0 && myLenNFKD
!=0) {
758 /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
760 for(i
=0; i
<length
; ++i
) {
763 uprv_memcpy(nfkd
+lenNFKD
, me
->norm
->nfkd
, myLenNFKD
*4);
772 /* set the new decompositions, forget the old ones */
775 if(lenNFD
>norm
->lenNFD
) {
776 s32
=utm_allocN(utf32Mem
, lenNFD
);
780 uprv_memcpy(s32
, nfd
, lenNFD
*4);
789 if(lenNFKD
>norm
->lenNFKD
) {
790 s32
=utm_allocN(utf32Mem
, lenNFKD
);
794 uprv_memcpy(s32
, nfkd
, lenNFKD
*4);
798 norm
->lenNFKD
=lenNFKD
;
804 * process the data for one code point listed in UnicodeData;
805 * UnicodeData itself never maps a code point to both NFD and NFKD
808 storeNorm(uint32_t code
, Norm
*norm
) {
809 DecompSingle decompSingle
;
812 if(DO_NOT_STORE(UGENNORM_STORE_COMPAT
)) {
813 /* ignore compatibility decomposition */
817 /* copy existing derived normalization properties */
819 norm
->qcFlags
=p
->qcFlags
;
820 norm
->combiningFlags
=p
->combiningFlags
;
821 norm
->fncIndex
=p
->fncIndex
;
823 /* process the decomposition if there is one here */
824 if((norm
->lenNFD
|norm
->lenNFKD
)!=0) {
825 /* decompose this one decomposition further, may generate two decompositions */
826 decompStoreNewNF(code
, norm
);
828 /* has this code point been used in previous decompositions? */
829 if(HAVE_SEEN(code
)) {
830 /* use this decomposition to decompose other decompositions further */
832 decompSingle
.norm
=norm
;
833 enumTrie(decompWithSingleFn
, &decompSingle
);
838 uprv_memcpy(p
, norm
, sizeof(Norm
));
842 setQCFlags(uint32_t code
, uint8_t qcFlags
) {
843 if(DO_NOT_STORE(UGENNORM_STORE_COMPAT
)) {
844 /* ignore compatibility decomposition: unset the KC/KD flags */
845 qcFlags
&=~(_NORM_QC_NFKC
|_NORM_QC_NFKD
);
847 /* set the KC/KD flags to the same values as the C/D flags */
850 if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION
)) {
851 /* ignore composition data: unset the C/KC flags */
852 qcFlags
&=~(_NORM_QC_NFC
|_NORM_QC_NFKC
);
854 /* set the C/KC flags to the same values as the D/KD flags */
858 createNorm(code
)->qcFlags
|=qcFlags
;
860 /* adjust the minimum code point for quick check no/maybe */
862 if((qcFlags
&_NORM_QC_NFC
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]) {
863 indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
]=(uint16_t)code
;
865 if((qcFlags
&_NORM_QC_NFKC
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]) {
866 indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
]=(uint16_t)code
;
868 if((qcFlags
&_NORM_QC_NFD
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]) {
869 indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]=(uint16_t)code
;
871 if((qcFlags
&_NORM_QC_NFKD
) && (uint16_t)code
<indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]) {
872 indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
]=(uint16_t)code
;
876 if(qcFlags
&_NORM_QC_NFD
) {
877 uset_add(nfdQCNoSet
, (UChar32
)code
);
/*
 * Mark code as a composition exclusion.
 * When composition data is stored (DO_STORE(UGENNORM_STORE_COMPOSITION)),
 * set bit 0x80 in the combiningFlags of the Norm struct for code, creating
 * the struct if necessary. Elsewhere in this file, addCombiningTriple() is
 * only called for a two-code-point NFD when this bit is clear.
 */
882 setCompositionExclusion(uint32_t code
) {
883 if(DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
884 createNorm(code
)->combiningFlags
|=0x80;
889 setHangulJamoSpecials() {
894 * Hangul syllables are algorithmically decomposed into Jamos,
895 * and Jamos are algorithmically composed into Hangul syllables.
896 * The quick check flags are parsed, except for Hangul.
899 /* set Jamo L specials */
901 for(c
=0x1100; c
<=0x1112; ++c
) {
903 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_L
;
904 if(DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
905 norm
->combiningFlags
=1;
908 /* for each Jamo L create a set with its associated Hangul block */
909 norm
->canonStart
=uset_open(hangul
, hangul
+21*28-1);
913 /* set Jamo V specials */
914 for(c
=0x1161; c
<=0x1175; ++c
) {
916 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_V
;
917 if(DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
918 norm
->combiningFlags
=2;
920 norm
->unsafeStart
=TRUE
;
923 /* set Jamo T specials */
924 for(c
=0x11a8; c
<=0x11c2; ++c
) {
926 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_JAMO_T
;
927 if(DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
928 norm
->combiningFlags
=2;
930 norm
->unsafeStart
=TRUE
;
933 /* set Hangul specials, precompacted */
935 norm
->specialTag
=_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_HANGUL
;
936 if(DO_STORE(UGENNORM_STORE_COMPAT
)) {
937 norm
->qcFlags
=_NORM_QC_NFD
|_NORM_QC_NFKD
;
939 norm
->qcFlags
=_NORM_QC_NFD
;
942 if(!utrie_setRange32(normTrie
, 0xac00, 0xd7a4, (uint32_t)(norm
-norms
), TRUE
)) {
943 fprintf(stderr
, "error: too many normalization entries (setting Hangul)\n");
944 exit(U_BUFFER_OVERFLOW_ERROR
);
949 * set FC-NFKC-Closure string
950 * s contains the closure string; s[0]==length, s[1..length] is the actual string
954 setFNC(uint32_t c
, UChar
*s
) {
956 int32_t length
, i
, count
;
959 if( DO_NOT_STORE(UGENNORM_STORE_COMPAT
) ||
960 DO_NOT_STORE(UGENNORM_STORE_COMPOSITION
) ||
961 DO_NOT_STORE(UGENNORM_STORE_AUX
)
966 count
=utm_countItems(extraMem
);
970 /* try to overlay single-unit strings with existing ones */
971 if(length
==1 && first
<0xff00) {
972 p
=utm_getStart(extraMem
);
973 for(i
=1; i
<count
; ++i
) {
982 /* append the new string if it cannot be overlaid with an old one */
984 if(count
>_NORM_AUX_MAX_FNC
) {
985 fprintf(stderr
, "gennorm error: too many FNC strings\n");
986 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
989 /* prepend 0xffxx with xx==length */
990 s
[0]=(uint16_t)(0xff00+length
);
992 p
=(uint16_t *)utm_allocN(extraMem
, length
);
993 uprv_memcpy(p
, s
, length
*2);
995 /* update the top index in extraMem[0] */
997 ((uint16_t *)utm_getStart(extraMem
))[0]=(uint16_t)count
;
1000 /* store the index to the string */
1001 createNorm(c
)->fncIndex
=i
;
1004 /* build runtime structures ------------------------------------------------- */
1006 /* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
1008 reorderString(uint32_t *s
, int32_t length
) {
1018 for(i
=0; i
<length
; ++i
) {
1019 /* get the i-th code point and its combining class */
1023 /* it is a combining mark, see if it needs to be moved back */
1028 break; /* found the right place */
1030 /* move the previous code point here and go back */
1037 /* just store the combining class */
1042 return (uint16_t)(((uint16_t)ccs
[0]<<8)|ccs
[length
-1]);
/*
 * Usage table, zero-initialized: postParseFn() sets the entry at index
 * (qcFlags&0x33)|((combiningFlags&3)<<2) to 1 for each combination of
 * quick check flags and combining flags that it encounters.
 */
1046 static UBool combineAndQC
[64]={ 0 };
1050 * canonically reorder the up to two decompositions
1051 * and store the leading and trailing combining classes accordingly
1053 * also process canonical decompositions for canonical closure
1056 postParseFn(void *context
, uint32_t code
, Norm
*norm
) {
1059 /* canonically order the NFD */
1060 length
=norm
->lenNFD
;
1062 norm
->canonBothCCs
=reorderString(norm
->nfd
, length
);
1065 /* canonically reorder the NFKD */
1066 length
=norm
->lenNFKD
;
1068 norm
->compatBothCCs
=reorderString(norm
->nfkd
, length
);
1071 /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
1072 if((norm
->lenNFD
!=0) != ((norm
->qcFlags
&_NORM_QC_NFD
)!=0)) {
1073 fprintf(stderr
, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code
, norm
->lenNFD
, norm
->qcFlags
);
1075 if(((norm
->lenNFD
|norm
->lenNFKD
)!=0) != ((norm
->qcFlags
&(_NORM_QC_NFD
|_NORM_QC_NFKD
))!=0)) {
1076 fprintf(stderr
, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code
, norm
->lenNFD
, norm
->lenNFKD
, norm
->qcFlags
);
1079 /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
1081 combineAndQC
[(norm
->qcFlags
&0x33)|((norm
->combiningFlags
&3)<<2)]=1;
1084 if(norm
->combiningFlags
&1) {
1085 if(norm
->udataCC
!=0) {
1086 /* illegal - data-derivable composition exclusion */
1087 fprintf(stderr
, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code
, norm
->udataCC
);
1090 if(norm
->combiningFlags
&2) {
1091 if((norm
->qcFlags
&0x11)==0) {
1092 fprintf(stderr
, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code
);
1095 /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
1096 if(norm
->udataCC
==0) {
1097 printf("U+%04lx combines backward but udataCC==0\n", (long)code
);
1101 if((norm
->combiningFlags
&3)==3 && beVerbose
) {
1102 printf("U+%04lx combines both ways\n", (long)code
);
1106 * process canonical decompositions for canonical closure
1108 * in each canonical decomposition:
1109 * add the current character (code) to the set of canonical starters of its norm->nfd[0]
1110 * set the "unsafe starter" flag for each norm->nfd[1..]
1112 length
=norm
->lenNFD
;
1118 /* nfd[0].canonStart.add(code) */
1120 otherNorm
=createNorm(c
);
1121 if(otherNorm
->canonStart
==NULL
) {
1122 otherNorm
->canonStart
=uset_open(code
, code
);
1123 if(otherNorm
->canonStart
==NULL
) {
1124 fprintf(stderr
, "gennorm error: out of memory in uset_open()\n");
1125 exit(U_MEMORY_ALLOCATION_ERROR
);
1128 uset_add(otherNorm
->canonStart
, code
);
1129 if(!uset_contains(otherNorm
->canonStart
, code
)) {
1130 fprintf(stderr
, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c
, (int)code
);
1131 exit(U_INTERNAL_PROGRAM_ERROR
);
1135 /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
1136 for(i
=1; i
<length
; ++i
) {
1137 createNorm(norm
->nfd
[i
])->unsafeStart
=TRUE
;
1143 make32BitNorm(Norm
*norm
) {
1147 int32_t i
, length
, beforeZero
=0, count
, start
;
1150 * Check for assumptions:
1152 * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
1153 * then the decomposition also begins with a true starter.
1155 if(norm
->udataCC
==0) {
1156 /* this is a starter */
1157 if((norm
->qcFlags
&_NORM_QC_NFC
)==0 && norm
->lenNFD
>0) {
1158 /* a "true" NFC starter with a canonical decomposition */
1159 if( norm
->canonBothCCs
>=0x100 || /* lead cc!=0 or */
1160 ((other
=getNorm(norm
->nfd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFC
)!=0) /* nfd[0] not NFC_YES */
1163 "error: true NFC starter canonical decomposition[%u] does not begin\n"
1164 " with a true NFC starter: U+%04lx U+%04lx%s\n",
1165 norm
->lenNFD
, (long)norm
->nfd
[0], (long)norm
->nfd
[1],
1166 norm
->lenNFD
<=2 ? "" : " ...");
1167 exit(U_INVALID_TABLE_FILE
);
1171 if((norm
->qcFlags
&_NORM_QC_NFKC
)==0) {
1172 if(norm
->lenNFKD
>0) {
1173 /* a "true" NFKC starter with a compatibility decomposition */
1174 if( norm
->compatBothCCs
>=0x100 || /* lead cc!=0 or */
1175 ((other
=getNorm(norm
->nfkd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFKC
)!=0) /* nfkd[0] not NFKC_YES */
1178 "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
1179 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1180 norm
->lenNFKD
, (long)norm
->nfkd
[0], (long)norm
->nfkd
[1],
1181 norm
->lenNFKD
<=2 ? "" : " ...");
1182 exit(U_INVALID_TABLE_FILE
);
1184 } else if(norm
->lenNFD
>0) {
1185 /* a "true" NFKC starter with only a canonical decomposition */
1186 if( norm
->canonBothCCs
>=0x100 || /* lead cc!=0 or */
1187 ((other
=getNorm(norm
->nfd
[0]))!=NULL
&& (other
->qcFlags
&_NORM_QC_NFKC
)!=0) /* nfd[0] not NFKC_YES */
1190 "error: true NFKC starter canonical decomposition[%u] does not begin\n"
1191 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1192 norm
->lenNFD
, (long)norm
->nfd
[0], (long)norm
->nfd
[1],
1193 norm
->lenNFD
<=2 ? "" : " ...");
1194 exit(U_INVALID_TABLE_FILE
);
1200 /* reset the 32-bit word and set the quick check flags */
1203 /* set the UnicodeData combining class */
1204 word
|=(uint32_t)norm
->udataCC
<<_NORM_CC_SHIFT
;
1206 /* set the combining flag and index */
1207 if(norm
->combiningFlags
&3) {
1208 word
|=(uint32_t)(norm
->combiningFlags
&3)<<6;
1211 /* set the combining index value into the extra data */
1212 /* 0xffff: no combining index; 0..0x7fff: combining index */
1213 if(norm
->combiningIndex
!=0xffff) {
1214 extra
[0]=norm
->combiningIndex
;
1220 /* write the decompositions */
1221 if((norm
->lenNFD
|norm
->lenNFKD
)!=0) {
1222 extra
[count
++]=0; /* set the pieces when available, into extra[beforeZero] */
1224 length
=norm
->lenNFD
;
1226 if(norm
->canonBothCCs
!=0) {
1227 extra
[beforeZero
]|=0x80;
1228 extra
[count
++]=norm
->canonBothCCs
;
1231 for(i
=0; i
<length
; ++i
) {
1232 UTF_APPEND_CHAR_UNSAFE(extra
, count
, norm
->nfd
[i
]);
1234 extra
[beforeZero
]|=(UChar
)(count
-start
); /* set the decomp length as the number of UTF-16 code units */
1237 length
=norm
->lenNFKD
;
1239 if(norm
->compatBothCCs
!=0) {
1240 extra
[beforeZero
]|=0x8000;
1241 extra
[count
++]=norm
->compatBothCCs
;
1244 for(i
=0; i
<length
; ++i
) {
1245 UTF_APPEND_CHAR_UNSAFE(extra
, count
, norm
->nfkd
[i
]);
1247 extra
[beforeZero
]|=(UChar
)((count
-start
)<<8); /* set the decomp length as the number of UTF-16 code units */
1251 /* allocate and copy the extra data */
1255 if(norm
->specialTag
!=0) {
1256 fprintf(stderr
, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm
->specialTag
);
1257 exit(U_ILLEGAL_ARGUMENT_ERROR
);
1260 p
=(UChar
*)utm_allocN(extraMem
, count
);
1261 uprv_memcpy(p
, extra
, count
*2);
1263 /* set the extra index, offset by beforeZero */
1264 word
|=(uint32_t)(beforeZero
+(p
-(UChar
*)utm_getStart(extraMem
)))<<_NORM_EXTRA_SHIFT
;
1265 } else if(norm
->specialTag
!=0) {
1266 /* set a special tag instead of an extra index */
1267 word
|=(uint32_t)norm
->specialTag
<<_NORM_EXTRA_SHIFT
;
1273 /* turn all Norm structs into corresponding 32-bit norm values */
1276 uint32_t *pNormData
;
1278 int32_t i
, normLength
, count
;
1280 count
=(int32_t)utm_countItems(normMem
);
1281 for(i
=0; i
<count
; ++i
) {
1282 norms
[i
].value32
=make32BitNorm(norms
+i
);
1285 pNormData
=utrie_getData(norm32Trie
, &normLength
);
1287 count
=0; /* count is now just used for debugging */
1288 for(i
=0; i
<normLength
; ++i
) {
1290 if(0!=(pNormData
[i
]=norms
[n
].value32
)) {
1297 * extract all Norm.canonBothCCs into the FCD table
1298 * set 32-bit values to use the common fold and compact functions
1304 int32_t i
, count
, fcdLength
;
1307 count
=utm_countItems(normMem
);
1308 for(i
=0; i
<count
; ++i
) {
1309 bothCCs
=norms
[i
].canonBothCCs
;
1311 /* if there are no decomposition cc's then use the udataCC twice */
1312 bothCCs
=norms
[i
].udataCC
;
1313 bothCCs
|=bothCCs
<<8;
1315 norms
[i
].value32
=bothCCs
;
1318 pFCDData
=utrie_getData(fcdTrie
, &fcdLength
);
1320 for(i
=0; i
<fcdLength
; ++i
) {
1322 pFCDData
[i
]=norms
[n
].value32
;
1327 * If the given set contains exactly one character, then return it.
1328 * Otherwise return -1.
1331 usetContainsOne(const USet
* set
) {
1332 if(uset_getItemCount(set
)==1) {
1333 /* there is a single item (a single range) */
1335 UErrorCode ec
=U_ZERO_ERROR
;
1336 int32_t len
=uset_getItem(set
, 0, &start
, &end
, NULL
, 0, &ec
);
1337 if (len
==0 && start
==end
) { /* a range (len==0) with a single code point */
1345 makeCanonSetFn(void *context
, uint32_t code
, Norm
*norm
) {
1346 if(norm
->canonStart
!=NULL
&& !uset_isEmpty(norm
->canonStart
)) {
1348 int32_t c
, tableLength
;
1349 UErrorCode errorCode
=U_ZERO_ERROR
;
1351 /* does the set contain exactly one code point? */
1352 c
=usetContainsOne(norm
->canonStart
);
1354 /* add an entry to the BMP or supplementary search table */
1356 table
=canonStartSets
+_NORM_MAX_CANON_SETS
;
1357 tableLength
=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1359 table
[tableLength
++]=(uint16_t)code
;
1361 if(c
>=0 && c
<=0xffff && (c
&_NORM_CANON_SET_BMP_MASK
)!=_NORM_CANON_SET_BMP_IS_INDEX
) {
1362 /* single-code point BMP result for BMP code point */
1363 table
[tableLength
++]=(uint16_t)c
;
1365 table
[tableLength
++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX
|canonStartSetsTop
);
1368 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]=(uint16_t)tableLength
;
1370 table
=canonStartSets
+_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
;
1371 tableLength
=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
1373 table
[tableLength
++]=(uint16_t)(code
>>16);
1374 table
[tableLength
++]=(uint16_t)code
;
1377 /* single-code point result for supplementary code point */
1378 table
[tableLength
-2]|=(uint16_t)(0x8000|((c
>>8)&0x1f00));
1379 table
[tableLength
++]=(uint16_t)c
;
1381 table
[tableLength
++]=(uint16_t)canonStartSetsTop
;
1383 canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]=(uint16_t)tableLength
;
1387 /* write a USerializedSet */
1390 uset_serialize(norm
->canonStart
,
1391 canonStartSets
+canonStartSetsTop
,
1392 _NORM_MAX_CANON_SETS
-canonStartSetsTop
,
1395 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]=(uint16_t)canonStartSetsTop
;
1397 if(U_FAILURE(errorCode
)) {
1398 fprintf(stderr
, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode
), (int)canonStartSetsTop
);
1401 if(tableLength
>_NORM_MAX_SET_SEARCH_TABLE_LENGTH
) {
1402 fprintf(stderr
, "gennorm error: search table for canonical starter sets too long\n");
1403 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
1408 /* for getSkippableFlags ---------------------------------------------------- */
1410 /* combine the lead and trail code points; return <0 if they do not combine */
1412 combine(uint32_t lead
, uint32_t trail
) {
1413 CombiningTriple
*triples
;
1416 /* search for all triples with c as lead code point */
1417 triples
=utm_getStart(combiningTriplesMem
);
1418 count
=utm_countItems(combiningTriplesMem
);
1420 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1421 for(i
=0; i
<count
&& lead
!=triples
[i
].lead
; ++i
) {}
1423 /* check each triple for this code point */
1424 for(; i
<count
&& lead
==triples
[i
].lead
; ++i
) {
1425 if(trail
==triples
[i
].trail
) {
1426 return (int32_t)triples
[i
].combined
;
1434 * Starting from the canonical decomposition s[0..length[ of a single code point,
1435 * is the code point c consumed in an NFC/FCC recomposition?
1437 * No need to handle discontiguous composition because that would not consume some
1438 * intermediate character, so would not compose back to the original character.
1439 * See comments in canChangeWithFollowing().
1441 * No need to compose beyond where c canonically orders because if it is consumed
1442 * then the result differs from the original anyway.
1444 * Possible optimization:
1445 * - Verify that there are no cases of the same combining mark stacking twice.
1446 * - return FALSE right away if c inserts after a copy of itself
1447 * without attempting to recompose; will happen because each mark in
1448 * the decomposition will be enumerated and passed in as c.
1449 * More complicated and fragile though than it is already.
1454 doesComposeConsume(const uint32_t *s
, int32_t length
, uint32_t c
, uint8_t cc
) {
1457 /* ignore trailing characters where cc<prevCC */
1458 while(length
>1 && cc
<getCCFromCP(s
[length
-1])) {
1462 /* start consuming/combining from the beginning */
1463 starter
=(int32_t)s
[0];
1464 for(i
=1; i
<length
; ++i
) {
1465 starter
=combine((uint32_t)starter
, s
[i
]);
1467 fprintf(stderr
, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
1468 (int)s
[0], (int)s
[1], (int)length
, (int)c
, cc
);
1469 exit(U_INTERNAL_PROGRAM_ERROR
);
1473 /* try to combine/consume c, return TRUE if it is consumed */
1474 return combine((uint32_t)starter
, c
)>=0;
1477 /* does the starter s[0] combine forward with another char that is below trailCC? */
1479 canChangeWithFollowing(const uint32_t *s
, int32_t length
, uint8_t trailCC
) {
1481 /* no character will combine ahead of the trailing char of the decomposition */
1486 * We are only checking skippable condition (f).
1487 * Therefore, the original character does not have quick check flag NFC_NO (c),
1488 * i.e., the decomposition recomposes completely back into the original code point.
1489 * So s[0] must be a true starter with cc==0 and
1490 * combining with following code points.
1492 * Similarly, length==1 is not possible because that would be a singleton
1493 * decomposition which is marked with NFC_NO and does not pass (c).
1495 * Only a character with cc<trailCC can change the composition.
1496 * Reason: A char with cc>=trailCC would order after decomposition s[],
1497 * composition would consume all of the decomposition, and here we know that
1498 * the original char passed check d), i.e., it does not combine forward,
1499 * therefore does not combine with anything after the decomposition is consumed.
1501 * Now see if there is a character that
1502 * 1. combines backward
1504 * 3. is consumed in recomposition
1506 * length==2 is simple:
1508 * Characters that fulfill these conditions are exactly the ones that combine directly
1509 * with the starter c==s[0] because there is no intervening character after
1511 * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
1512 * and see if one has cc<trailCC (passes 2.).
1514 * length>2 is a little harder:
1516 * Since we will get different starters during recomposition, we need to
1517 * enumerate each backward-combining character (1.)
1518 * with cc<trailCC (2.) and
1519 * see if it gets consumed in recomposition. (3.)
1520 * No need to enumerate both-ways combining characters because they must have cc==0.
1523 /* enumerate all chars that combine with this one and check their cc */
1524 CombiningTriple
*triples
;
1525 uint32_t c
, i
, count
;
1528 /* search for all triples with c as lead code point */
1529 triples
=utm_getStart(combiningTriplesMem
);
1530 count
=utm_countItems(combiningTriplesMem
);
1533 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1534 for(i
=0; i
<count
&& c
!=triples
[i
].lead
; ++i
) {}
1536 /* check each triple for this code point */
1537 for(; i
<count
&& c
==triples
[i
].lead
; ++i
) {
1538 cc
=getCCFromCP(triples
[i
].trail
);
1539 if(cc
>0 && cc
<trailCC
) {
1540 /* this trail code point combines with c and has cc<trailCC */
1545 /* enumerate all chars that combine backward */
1550 for(i
=combineBothTop
; i
<combineBackTop
; ++i
) {
1551 c2
=combiningCPs
[i
]&0xffffff;
1553 /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
1554 if(cc
>0 && cc
<trailCC
&& doesComposeConsume(s
, length
-1, c2
, cc
)) {
1560 /* this decomposition is not modified by any appended character */
1564 /* see unormimp.h for details on NF*C Skippable flags */
1566 getSkippableFlags(const Norm
*norm
) {
1567 /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
1569 /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
1570 if(norm
->specialTag
==_NORM_EXTRA_INDEX_TOP
+_NORM_EXTRA_HANGUL
) {
1574 /* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */
1578 * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
1580 * This means that (a)..(e) must always be derived from the runtime norm32 value,
1581 * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
1582 * the form is NF*C and there is a canonical decomposition (NFD_NO).
1584 * (a) unassigned code points get "not skippable"==false because they
1585 * don't have a Norm struct so they won't get here
1588 /* (b) not skippable if cc!=0 */
1589 if(norm
->udataCC
!=0) {
1590 return 0; /* non-zero flag for (f) only */
1594 * not NFC_Skippable if
1595 * (c) quick check flag == NO or
1596 * (d) combines forward or
1597 * (e) combines back or
1598 * (f) can change if another character is added
1601 * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
1602 * check its composition list,
1603 * see if any of the second code points in the list
1604 * has cc less than the trailCC of the decomposition.
1606 * For FCC: Test at runtime if the decomposition has a trailCC>1
1607 * -> there are characters with cc==1, they would order before the trail char
1608 * and prevent contiguous combination with the trail char.
1610 if( (norm
->qcFlags
&(_NORM_QC_NFC
&_NORM_QC_ANY_NO
))!=0 ||
1611 (norm
->combiningFlags
&3)!=0) {
1612 return 0; /* non-zero flag for (f) only */
1614 if(norm
->lenNFD
!=0 && canChangeWithFollowing(norm
->nfd
, norm
->lenNFD
, (uint8_t)norm
->canonBothCCs
)) {
1615 return _NORM_AUX_NFC_SKIP_F_MASK
;
1618 return 0; /* skippable */
1627 pData
=utrie_getData(auxTrie
, &length
);
1629 for(i
=0; i
<length
; ++i
) {
1630 norm
=norms
+pData
[i
];
1632 * 16-bit auxiliary normalization properties
1636 ((uint32_t)(norm
->combiningFlags
&0x80)<<(_NORM_AUX_COMP_EX_SHIFT
-7))|
1637 (uint32_t)norm
->fncIndex
;
1639 if(norm
->unsafeStart
|| norm
->udataCC
!=0) {
1640 pData
[i
]|=_NORM_AUX_UNSAFE_MASK
;
1643 pData
[i
]|=getSkippableFlags(norm
);
1647 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
1648 static uint32_t U_CALLCONV
1649 getFoldedNormValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
1650 uint32_t value
, leadNorm32
=0;
1655 while(start
<limit
) {
1656 value
=utrie_get32(trie
, start
, &inBlockZero
);
1658 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1667 /* turn multi-bit fields into the worst-case value */
1668 if(leadNorm32
&_NORM_CC_MASK
) {
1669 leadNorm32
|=_NORM_CC_MASK
;
1672 /* clean up unnecessarily ored bit fields */
1673 leadNorm32
&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT
);
1676 /* nothing to do (only composition exclusions?) */
1680 /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
1682 (uint32_t)_NORM_EXTRA_INDEX_TOP
+
1683 (uint32_t)((offset
-UTRIE_BMP_INDEX_LENGTH
)>>UTRIE_SURROGATE_BLOCK_BITS
)
1684 )<<_NORM_EXTRA_SHIFT
;
1689 /* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */
1692 * folding value for auxiliary data:
1693 * store the non-zero offset in bits 9..0 (FNC bits)
1694 * if there is any non-0 entry;
1695 * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
1697 static uint32_t U_CALLCONV
1698 getFoldedAuxValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
1699 uint32_t value
, oredValues
;
1705 while(start
<limit
) {
1706 value
=utrie_get32(trie
, start
, &inBlockZero
);
1708 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1716 /* move the 10 significant offset bits into bits 9..0 */
1717 offset
>>=UTRIE_SURROGATE_BLOCK_BITS
;
1718 if(offset
>_NORM_AUX_FNC_MASK
) {
1719 fprintf(stderr
, "gennorm error: folding offset too large (auxTrie)\n");
1720 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
1722 return (uint32_t)offset
|(oredValues
&~_NORM_AUX_FNC_MASK
);
1736 /* canonically reorder decompositions and assign combining classes for decompositions */
1737 enumTrie(postParseFn
, NULL
);
1740 for(i
=1; i
<64; ++i
) {
1741 if(combineAndQC
[i
]) {
1742 printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i
&0xc)>>2, i
&0x33);
1747 /* add hangul/jamo specials */
1748 setHangulJamoSpecials();
1750 /* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */
1751 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]=(uint16_t)canonStartSetsTop
;
1753 /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
1754 if(DO_STORE(UGENNORM_STORE_AUX
) && DO_STORE(UGENNORM_STORE_COMPOSITION
)) {
1755 enumTrie(makeCanonSetFn
, NULL
);
1758 /* clone the normalization builder trie to make the final data tries */
1759 if( NULL
==utrie_clone(norm32Trie
, normTrie
, NULL
, 0) ||
1760 NULL
==utrie_clone(fcdTrie
, normTrie
, NULL
, 0) ||
1761 NULL
==utrie_clone(auxTrie
, normTrie
, NULL
, 0)
1763 fprintf(stderr
, "error: unable to clone the normalization trie\n");
1764 exit(U_MEMORY_ALLOCATION_ERROR
);
1767 /* --- finalize data for quick checks & normalization --- */
1769 /* turn the Norm structs (stage2, norms) into 32-bit data words */
1772 /* --- finalize data for FCD checks --- */
1774 /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
1777 /* --- finalize auxiliary normalization data --- */
1782 printf("number of stage 2 entries: %ld\n", stage2Mem
->index
);
1783 printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT
*2+stage2Mem
->index
*4+extraMem
->index
*2);
1785 printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop
, combineBothTop
, combineBackTop
);
1786 printf("combining table count: %u\n", combiningTableTop
);
1790 #endif /* #if !UCONFIG_NO_NORMALIZATION */
/*
 * Write the collected normalization data to the output.
 *
 * Visible steps, in order:
 * - serialize norm32Trie (folding with getFoldedNormValue), fcdTrie and
 *   auxTrie (folding with getFoldedAuxValue) into static 100000-byte blocks;
 *   the FCD and aux tries are serialized inside DO_STORE() checks
 * - move the BMP and supplementary search tables in canonStartSets[] down so
 *   that they directly follow the serialized sets
 * - build the two normalization exclusion sets from patterns and serialize
 *   them behind the search tables
 * - pad combiningTable[] and canonStartSets[] with one filler unit each
 *   where a count is odd
 * - fill in indexes[] and print size statistics
 * - write the generated C file "unorm_props_data.c" via usrc_*(), or the
 *   binary data file via udata_create()/udata_writeBlock()/udata_finish()
 * - exit with U_INTERNAL_PROGRAM_ERROR if the length returned by
 *   udata_finish() differs from the calculated size
 *
 * NOTE(review): several lines of this function are not visible here
 * (preprocessor branches, error exits, the test that selects between the
 * C source and data file output - presumably on csource); confirm the
 * control flow against the complete file.
 *
 * dataDir is passed to usrc_create() and udata_create().
 */
1793 generateData(const char *dataDir
, UBool csource
) {
1794 static uint8_t normTrieBlock
[100000], fcdTrieBlock
[100000], auxTrieBlock
[100000];
1796 UNewDataMemory
*pData
;
1797 UErrorCode errorCode
=U_ZERO_ERROR
;
1798 int32_t size
, dataLength
;
1800 #if UCONFIG_NO_NORMALIZATION
1806 U_STRING_DECL(nxCJKCompatPattern
, "[:Ideographic:]", 15);
1807 U_STRING_DECL(nxUnicode32Pattern
, "[:^Age=3.2:]", 12);
1809 int32_t normTrieSize
, fcdTrieSize
, auxTrieSize
;
/* serialize the builder tries into the static byte blocks declared above */
1811 normTrieSize
=utrie_serialize(norm32Trie
, normTrieBlock
, sizeof(normTrieBlock
), getFoldedNormValue
, FALSE
, &errorCode
);
1812 if(U_FAILURE(errorCode
)) {
1813 fprintf(stderr
, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode
));
1817 if(DO_STORE(UGENNORM_STORE_FCD
)) {
1818 fcdTrieSize
=utrie_serialize(fcdTrie
, fcdTrieBlock
, sizeof(fcdTrieBlock
), NULL
, TRUE
, &errorCode
);
1819 if(U_FAILURE(errorCode
)) {
1820 fprintf(stderr
, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode
));
1827 if(DO_STORE(UGENNORM_STORE_AUX
)) {
1828 auxTrieSize
=utrie_serialize(auxTrie
, auxTrieBlock
, sizeof(auxTrieBlock
), getFoldedAuxValue
, TRUE
, &errorCode
);
1829 if(U_FAILURE(errorCode
)) {
1830 fprintf(stderr
, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode
));
1837 /* move the parts of canonStartSets[] together into a contiguous block */
1838 if( canonStartSetsTop
<_NORM_MAX_CANON_SETS
&&
1839 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]!=0
1841 uprv_memmove(canonStartSets
+canonStartSetsTop
,
1842 canonStartSets
+_NORM_MAX_CANON_SETS
,
1843 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]*2);
1845 canonStartSetsTop
+=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1847 if( canonStartSetsTop
<(_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
) &&
1848 canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]!=0
1850 uprv_memmove(canonStartSets
+canonStartSetsTop
,
1851 canonStartSets
+_NORM_MAX_CANON_SETS
+_NORM_MAX_SET_SEARCH_TABLE_LENGTH
,
1852 canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]*2);
1854 canonStartSetsTop
+=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
1856 /* create the normalization exclusion sets */
1858 * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
1859 * but we cannot use NFD_QC from the pattern because that would require
1860 * unorm.icu which we are just going to generate.
1861 * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
1864 U_STRING_INIT(nxCJKCompatPattern
, "[:Ideographic:]", 15);
1865 U_STRING_INIT(nxUnicode32Pattern
, "[:^Age=3.2:]", 12);
/* each exclusion set is serialized at the offset that is recorded in its index slot */
1867 canonStartSets
[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
]=canonStartSetsTop
;
1868 set
=uset_openPattern(nxCJKCompatPattern
, -1, &errorCode
);
1869 if(U_FAILURE(errorCode
)) {
1870 fprintf(stderr
, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode
));
1873 uset_retainAll(set
, nfdQCNoSet
);
1874 if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS
)) {
1877 canonStartSetsTop
+=uset_serialize(set
, canonStartSets
+canonStartSetsTop
, LENGTHOF(canonStartSets
)-canonStartSetsTop
, &errorCode
);
1878 if(U_FAILURE(errorCode
)) {
1879 fprintf(stderr
, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode
));
1884 canonStartSets
[_NORM_SET_INDEX_NX_UNICODE32_OFFSET
]=canonStartSetsTop
;
1885 set
=uset_openPattern(nxUnicode32Pattern
, -1, &errorCode
);
1886 if(U_FAILURE(errorCode
)) {
1887 fprintf(stderr
, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode
));
1890 if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS
)) {
1893 canonStartSetsTop
+=uset_serialize(set
, canonStartSets
+canonStartSetsTop
, LENGTHOF(canonStartSets
)-canonStartSetsTop
, &errorCode
);
1894 if(U_FAILURE(errorCode
)) {
1895 fprintf(stderr
, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode
));
1900 canonStartSets
[_NORM_SET_INDEX_NX_RESERVED_OFFSET
]=canonStartSetsTop
;
1902 /* make sure that the FCD trie is 4-aligned */
1903 if((utm_countItems(extraMem
)+combiningTableTop
)&1) {
1904 combiningTable
[combiningTableTop
++]=0x1234; /* add one 16-bit word for an even number */
1907 /* pad canonStartSets to 4-alignment, too */
1908 if(canonStartSetsTop
&1) {
1909 canonStartSets
[canonStartSetsTop
++]=0x1235;
/* NOTE(review): the start of this sum is not visible here; the visible terms are the byte sizes of the 16-bit arrays */
1915 utm_countItems(extraMem
)*2+
1916 combiningTableTop
*2+
1919 canonStartSetsTop
*2;
1922 printf("size of normalization trie %5u bytes\n", (int)normTrieSize
);
1923 printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem
));
1924 printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem
))[0]);
1925 printf("size of combining table %5u uint16_t\n", combiningTableTop
);
1926 printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize
);
1927 printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize
);
1928 printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop
);
1929 printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP
);
1930 printf(" size of sets %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-_NORM_SET_INDEX_TOP
);
1931 printf(" number of sets %5d\n", (int)canonSetsCount
);
1932 printf(" size of BMP search table %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
]);
1933 printf(" size of supplementary search table %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
]);
1934 printf(" length of exclusion sets %5u uint16_t\n", canonStartSets
[_NORM_SET_INDEX_NX_RESERVED_OFFSET
]-canonStartSets
[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
]);
1935 printf("size of " U_ICUDATA_NAME
"_" DATA_NAME
"." DATA_TYPE
" contents: %ld bytes\n", (long)size
);
/* record the sizes and counts of the data pieces in indexes[] */
1938 indexes
[_NORM_INDEX_TRIE_SIZE
]=normTrieSize
;
1939 indexes
[_NORM_INDEX_UCHAR_COUNT
]=(uint16_t)utm_countItems(extraMem
);
1941 indexes
[_NORM_INDEX_COMBINE_DATA_COUNT
]=combiningTableTop
;
1942 indexes
[_NORM_INDEX_COMBINE_FWD_COUNT
]=combineFwdTop
;
1943 indexes
[_NORM_INDEX_COMBINE_BOTH_COUNT
]=(uint16_t)(combineBothTop
-combineFwdTop
);
1944 indexes
[_NORM_INDEX_COMBINE_BACK_COUNT
]=(uint16_t)(combineBackTop
-combineBothTop
);
1946 /* the quick check minimum code points are already set */
1948 indexes
[_NORM_INDEX_FCD_TRIE_SIZE
]=fcdTrieSize
;
1949 indexes
[_NORM_INDEX_AUX_TRIE_SIZE
]=auxTrieSize
;
1950 indexes
[_NORM_INDEX_CANON_SET_COUNT
]=canonStartSetsTop
;
1955 #if UCONFIG_NO_NORMALIZATION
1956 /* no csource for dummy mode..? */
1957 fprintf(stderr
, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n");
1960 /* write .c file for hardcoded data */
1961 UTrie normTrie2
={ NULL
}, fcdTrie2
={ NULL
}, auxTrie2
={ NULL
};
/* read the serialized blocks back into UTrie structs for the usrc_writeUTrie*() calls below */
1964 utrie_unserialize(&normTrie2
, normTrieBlock
, normTrieSize
, &errorCode
);
1966 utrie_unserialize(&fcdTrie2
, fcdTrieBlock
, fcdTrieSize
, &errorCode
);
1969 utrie_unserialize(&auxTrie2
, auxTrieBlock
, auxTrieSize
, &errorCode
);
1971 if(U_FAILURE(errorCode
)) {
1974 "gennorm error: failed to utrie_unserialize() one of the tries - %s\n",
1975 u_errorName(errorCode
));
1979 f
=usrc_create(dataDir
, "unorm_props_data.c");
1982 "static const UVersionInfo formatVersion={ ",
1983 dataInfo
.formatVersion
, 8, 4,
1986 "static const UVersionInfo dataVersion={ ",
1987 dataInfo
.dataVersion
, 8, 4,
1990 "static const int32_t indexes[_NORM_INDEX_TOP]={\n",
1991 indexes
, 32, _NORM_INDEX_TOP
,
1993 usrc_writeUTrieArrays(f
,
1994 "static const uint16_t normTrie_index[%ld]={\n",
1995 "static const uint32_t normTrie_data32[%ld]={\n",
1998 usrc_writeUTrieStruct(f
,
1999 "static const UTrie normTrie={\n",
2000 &normTrie2
, "normTrie_index", "normTrie_data32", "getFoldingNormOffset",
2003 "static const uint16_t extraData[%ld]={\n",
2004 utm_getStart(extraMem
), 16, utm_countItems(extraMem
),
2007 "static const uint16_t combiningTable[%ld]={\n",
2008 combiningTable
, 16, combiningTableTop
,
2011 usrc_writeUTrieArrays(f
,
2012 "static const uint16_t fcdTrie_index[%ld]={\n", NULL
,
2015 usrc_writeUTrieStruct(f
,
2016 "static const UTrie fcdTrie={\n",
2017 &fcdTrie2
, "fcdTrie_index", NULL
, NULL
,
2020 fputs( "static const UTrie fcdTrie={ NULL };\n\n", f
);
2023 usrc_writeUTrieArrays(f
,
2024 "static const uint16_t auxTrie_index[%ld]={\n", NULL
,
2027 usrc_writeUTrieStruct(f
,
2028 "static const UTrie auxTrie={\n",
2029 &auxTrie2
, "auxTrie_index", NULL
, "getFoldingAuxOffset",
2032 fputs( "static const UTrie auxTrie={ NULL };\n\n", f
);
2035 "static const uint16_t canonStartSets[%ld]={\n",
2036 canonStartSets
, 16, canonStartSetsTop
,
2042 /* write the data */
2043 pData
=udata_create(dataDir
, DATA_TYPE
, DATA_NAME
, &dataInfo
,
2044 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, &errorCode
);
2045 if(U_FAILURE(errorCode
)) {
2046 fprintf(stderr
, "gennorm: unable to create the output file, error %d\n", errorCode
);
2050 #if !UCONFIG_NO_NORMALIZATION
/* the blocks are written in this order: indexes, norm trie, extra data, combining table, FCD trie, aux trie, canonStartSets */
2052 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
2053 udata_writeBlock(pData
, normTrieBlock
, normTrieSize
);
2054 udata_writeBlock(pData
, utm_getStart(extraMem
), utm_countItems(extraMem
)*2);
2055 udata_writeBlock(pData
, combiningTable
, combiningTableTop
*2);
2056 udata_writeBlock(pData
, fcdTrieBlock
, fcdTrieSize
);
2057 udata_writeBlock(pData
, auxTrieBlock
, auxTrieSize
);
2058 udata_writeBlock(pData
, canonStartSets
, canonStartSetsTop
*2);
2063 dataLength
=udata_finish(pData
, &errorCode
);
2064 if(U_FAILURE(errorCode
)) {
2065 fprintf(stderr
, "gennorm: error %d writing the output file\n", errorCode
);
/* sanity check: the length from udata_finish() must equal the size that was calculated above */
2069 if(dataLength
!=size
) {
2070 fprintf(stderr
, "gennorm error: data length %ld != calculated size %ld\n",
2071 (long)dataLength
, (long)size
);
2072 exit(U_INTERNAL_PROGRAM_ERROR
);
2077 #if !UCONFIG_NO_NORMALIZATION
2083 count
=utm_countItems(normMem
);
2084 for(i
=0; i
<count
; ++i
) {
2085 uset_close(norms
[i
].canonStart
);
2089 utm_close(utf32Mem
);
2090 utm_close(extraMem
);
2091 utm_close(combiningTriplesMem
);
2092 utrie_close(normTrie
);
2093 utrie_close(norm32Trie
);
2094 utrie_close(fcdTrie
);
2095 utrie_close(auxTrie
);
2097 uset_close(nfdQCNoSet
);
2099 uprv_free(normTrie
);
2100 uprv_free(norm32Trie
);
2105 #endif /* #if !UCONFIG_NO_NORMALIZATION */
2108 * Hey, Emacs, please set the following:
2111 * indent-tabs-mode: nil