2 *******************************************************************************
4 * Copyright (C) 2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
16 * Store Unicode case mapping properties efficiently for
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/ustring.h"
29 #include "unicode/udata.h"
34 /* Unicode case mapping properties file format ---------------------------------
36 The file format prepared and written here contains several data
37 structures that store indexes or data.
39 Before the data contents described below, there are the headers required by
40 the udata API for loading ICU data. Especially, a UDataInfo structure
41 precedes the actual data. It contains platform properties values and the
44 The following is a description of format version 1.
46 The file contains the following structures:
48 const int32_t indexes[i0] with values i0, i1, ...:
49 (see UCASE_IX_... constants for names of indexes)
51 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
52 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
53 i2 trieSize; -- size in bytes of the case mapping properties trie
54 i3 exceptionsLength; -- length in uint16_t of the exceptions array
56 i4..i14 reservedIndexes; -- reserved values; 0 for now
58 i15 maxFullLength; -- maximum length of a full case mapping/folding string
61 Serialized trie, see utrie.h;
63 const uint16_t exceptions[exceptionsLength];
69 15..4 unsigned exception index
72 15..6 signed delta to simple case mapping code point
73 (add delta to input code point)
75 6 the code point is case-ignorable
76 (U+0307 is also case-ignorable but has an exception)
78 5..4 0 normal character with cc=0
79 1 soft-dotted character
92 A sub-array of the exceptions array is indexed by the exception index in a
94 The sub-array consists of the following fields:
96 uint16_t optional values [];
97 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
99 excWord: (see UCASE_EXC_...)
101 15 conditional case folding
102 14 conditional special casing
103 13..12 same as non-exception trie data bits 5..4
104 moved here because the exception index needs more bits than the delta
105 0 normal character with cc=0
106 1 soft-dotted character
110 8 if set, then for each optional-value slot there are 2 uint16_t values
111 (high and low parts of 32-bit values)
112 instead of single ones
113 7.. 0 bits for which optional value is present
115 Optional-value slots:
116 0 lowercase mapping (code point)
117 1 case folding (code point)
118 2 uppercase mapping (code point)
119 3 titlecase mapping (code point)
121 7 there is at least one full (string) case mapping
122 the length of each is encoded in a nibble of this optional value,
123 and the strings follow this optional value in the same order:
124 lower/fold/upper/title
126 For space saving, some values are not stored. Lookups are as follows:
127 - If special casing is conditional, then no full lower/upper/title mapping
129 - If case folding is conditional, then no simple or full case foldings are
131 - Fall back in this order:
132 full (string) mapping -- if full mappings are used
133 simple (code point) mapping of the same type
134 simple fold->simple lower
135 simple title->simple upper
136 finally, the original code point (no mapping)
138 ----------------------------------------------------------------------------- */
140 /* UDataInfo cf. udata.h */
/*
 * Header record for the generated data file. generateData() passes its
 * address to udata_create(), and setUnicodeVersion() overwrites the
 * dataVersion field.
 * NOTE(review): only part of the initializer is visible in this view.
 */
141 static UDataInfo dataInfo
={
150 /* dataFormat="cAsE" */
151 { UCASE_FMT_0
, UCASE_FMT_1
, UCASE_FMT_2
, UCASE_FMT_3
},
/* 1, 0 followed by the two trie shift constants */
152 { 1, 0, UTRIE_SHIFT
, UTRIE_INDEX_SHIFT
}, /* formatVersion */
/* initial value only; setUnicodeVersion() copies the parsed version over it */
153 { 4, 0, 1, 0 } /* dataVersion */
157 /* maximum number of exceptions expected */
161 /* exceptions values */
/*
 * 16-bit output words appended by makeException(); exceptionsTop is the
 * number of words used so far and is written out as
 * indexes[UCASE_IX_EXC_LENGTH].
 * NOTE(review): makeException() tests the UCASE_MAX_EXCEPTIONS limit only
 * before it starts a record, so the +100 looks like headroom for the last
 * record -- confirm that it is large enough.
 */
162 static uint16_t exceptions
[UCASE_MAX_EXCEPTIONS
+100];
163 static uint16_t exceptionsTop
=0;
/*
 * Copies of the Props of each code point that was flagged with
 * UCASE_EXCEPTION, kept for later encoding by makeException();
 * exceptionsCount is the number of entries stored.
 */
164 static Props excProps
[MAX_EXC_COUNT
];
165 static uint16_t exceptionsCount
=0;
167 /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
/* starts at U16_MAX_LENGTH; makeException() raises it to the longest full mapping seen */
168 static int32_t maxFullLength
=U16_MAX_LENGTH
;
170 /* -------------------------------------------------------------------------- */
/*
 * Parse the version string v into a UVersionInfo and copy its first
 * 4 bytes into dataInfo.dataVersion, replacing the static default.
 *
 * v: version string handed to u_versionFromString().
 *    NOTE(review): presumably the Unicode version in dotted form; the
 *    caller is not visible here -- confirm.
 */
173 setUnicodeVersion(const char *v
) {
174 UVersionInfo version
;
175 u_versionFromString(version
, v
);
/* NOTE(review): the literal 4 is assumed to match the size of dataVersion -- confirm */
176 uprv_memcpy(dataInfo
.dataVersion
, version
, 4);
179 /* store a character's properties ------------------------------------------- */
183 UErrorCode errorCode
;
184 uint32_t value
, oldValue
;
187 /* get the non-UnicodeData.txt properties */
188 value
=oldValue
=upvec_getValue(pv
, p
->code
, 0);
190 /* default: map to self */
193 if(p
->gc
==U_TITLECASE_LETTER
) {
194 /* the Titlecase property is read late, from UnicodeData.txt */
198 if(p
->upperCase
!=0) {
199 /* uppercase mapping as delta if the character is lowercase */
200 if((value
&UCASE_TYPE_MASK
)==UCASE_LOWER
) {
201 delta
=p
->upperCase
-p
->code
;
203 value
|=UCASE_EXCEPTION
;
206 if(p
->lowerCase
!=0) {
207 /* lowercase mapping as delta if the character is uppercase or titlecase */
208 if((value
&UCASE_TYPE_MASK
)>=UCASE_UPPER
) {
209 delta
=p
->lowerCase
-p
->code
;
211 value
|=UCASE_EXCEPTION
;
214 if(p
->upperCase
!=p
->titleCase
) {
215 value
|=UCASE_EXCEPTION
;
217 if(p
->specialCasing
!=NULL
) {
218 value
|=UCASE_EXCEPTION
;
220 if(p
->caseFolding
!=NULL
) {
221 value
|=UCASE_EXCEPTION
;
224 if(delta
<UCASE_MIN_DELTA
|| UCASE_MAX_DELTA
<delta
) {
225 value
|=UCASE_EXCEPTION
;
229 if(value
&UCASE_DOT_MASK
) {
230 fprintf(stderr
, "gencase: a soft-dotted character has cc!=0\n");
231 exit(U_INTERNAL_PROGRAM_ERROR
);
236 value
|=UCASE_OTHER_ACCENT
;
240 /* encode case-ignorable as delta==1 on uncased characters */
242 (value
&UCASE_TYPE_MASK
)==UCASE_NONE
&&
244 ((U_MASK(p
->gc
)&(U_GC_MN_MASK
|U_GC_ME_MASK
|U_GC_CF_MASK
|U_GC_LM_MASK
|U_GC_SK_MASK
))!=0 ||
245 p
->code
==0x27 || p
->code
==0xad || p
->code
==0x2019)
248 * We use one of the delta/exception bits, which works because we only
249 * store the case-ignorable flag for uncased characters.
250 * There is no delta for uncased characters (see checks above).
251 * If there is an exception for an uncased, case-ignorable character
252 * (although there should not be any case mappings if it's uncased)
253 * then we have a problem.
254 * There is one character which is case-ignorable but has an exception:
255 * U+0307 is uncased, Mn, has conditional special casing and
256 * is therefore handled in code instead.
258 if(value
&UCASE_EXCEPTION
) {
259 fprintf(stderr
, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
260 (unsigned long)p
->code
);
261 exit(U_INTERNAL_PROGRAM_ERROR
);
267 /* handle exceptions */
268 if(value
&UCASE_EXCEPTION
) {
269 /* simply store exceptions for later processing and encoding */
270 value
|=(uint32_t)exceptionsCount
<<UGENCASE_EXC_SHIFT
;
271 uprv_memcpy(excProps
+exceptionsCount
, p
, sizeof(*p
));
272 if(++exceptionsCount
==MAX_EXC_COUNT
) {
273 fprintf(stderr
, "gencase: too many exceptions\n");
274 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
277 /* store the simple case mapping delta */
278 value
|=((uint32_t)delta
<<UCASE_DELTA_SHIFT
)&UCASE_DELTA_MASK
;
281 errorCode
=U_ZERO_ERROR
;
282 if( value
!=oldValue
&&
283 !upvec_setValue(pv
, p
->code
, p
->code
+1, 0, value
, 0xffffffff, &errorCode
)
285 fprintf(stderr
, "gencase error: unable to set case mapping values, code: %s\n",
286 u_errorName(errorCode
));
/*
 * Set the UCASE_SENSITIVE bit in column 0 of the properties vectors pv
 * for the code point range first..last.
 *
 * first, last: range ends; last is treated as inclusive, which is why
 *              last+1 is passed as the limit to upvec_setValue().
 *
 * If upvec_setValue() fails, a message with the error name is written to
 * stderr. What follows the message is on lines not visible in this view.
 */
292 addCaseSensitive(UChar32 first
, UChar32 last
) {
293 UErrorCode errorCode
=U_ZERO_ERROR
;
/* value and mask are both UCASE_SENSITIVE: only that bit is changed */
294 if(!upvec_setValue(pv
, first
, last
+1, 0, UCASE_SENSITIVE
, UCASE_SENSITIVE
, &errorCode
)) {
295 fprintf(stderr
, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
296 u_errorName(errorCode
));
306 /* exceptions --------------------------------------------------------------- */
/*
 * Test whether a full (string) mapping is just the simple mapping again,
 * so that makeException() can drop the redundant string.
 *
 * s:      full mapping string.
 *         NOTE(review): callers pass arrays whose element [0] holds the
 *         length; the lines that read that length and declare
 *         length/i/full are not visible here -- confirm.
 * simple: simple mapping code point.
 * c:      the original code point; it replaces simple in the comparison
 *         (see the inline comment on that assignment; the guarding
 *         condition is on a line not visible here).
 *
 * Returns a UBool that is true only if U16_NEXT consumed the whole string
 * (i==length) and the single code point read equals simple.
 */
309 fullMappingEqualsSimple(const UChar
*s
, UChar32 simple
, UChar32 c
) {
/* an empty string, or one longer than a single code point can be, never matches */
314 if(length
==0 || length
>U16_MAX_LENGTH
) {
/* read the first code point of s into full, advancing i */
318 U16_NEXT(s
, i
, length
, full
);
321 simple
=c
; /* UCD has no simple mapping if it's the same as the code point itself */
323 return (UBool
)(i
==length
&& full
==simple
);
/*
 * Encode one exception record for the character described by p and append
 * it to exceptions[], starting at the current exceptionsTop.
 *
 * value: the properties word of the character; only its UCASE_DOT_MASK
 *        bits are used in the lines visible here.
 * p:     the saved properties of the character. This function modifies
 *        *p: it clears p->specialCasing for conditional (isComplex)
 *        special casing and zeroes the length element of full mappings
 *        that only repeat the simple mapping.
 *
 * Record layout as written below: excWord at excIndex, then the optional
 * value slots (one or two uint16_t each), then the full mapping strings
 * in the order lower, fold, upper, title.
 *
 * Side effects: advances exceptionsTop and may raise maxFullLength.
 * Exits the program if exceptionsTop has already reached
 * UCASE_MAX_EXCEPTIONS.
 *
 * NOTE(review): several lines of this function (declarations of slots,
 * slotBits and doubleSlots, count increments, else branches, the return)
 * are not visible in this view.
 */
327 makeException(uint32_t value
, Props
*p
) {
330 uint16_t excWord
, excIndex
, excTop
, i
, count
, length
, fullLengths
;
333 /* excIndex will be returned for storing in the trie word */
334 excIndex
=exceptionsTop
;
/* the limit is checked once, before anything is written for this record */
335 if(excIndex
>=UCASE_MAX_EXCEPTIONS
) {
336 fprintf(stderr
, "gencase error: too many exceptions words\n");
337 exit(U_BUFFER_OVERFLOW_ERROR
);
340 excTop
=excIndex
+1; /* +1 for excWord which will be stored at excIndex */
342 /* copy and shift the soft-dotted bits */
343 excWord
=((uint16_t)value
&UCASE_DOT_MASK
)<<UCASE_EXC_DOT_SHIFT
;
345 /* update maxFullLength */
/* element [0] of each mapping array is used as the string length */
346 if(p
->specialCasing
!=NULL
) {
347 length
=p
->specialCasing
->lowerCase
[0];
348 if(length
>maxFullLength
) {
349 maxFullLength
=length
;
351 length
=p
->specialCasing
->upperCase
[0];
352 if(length
>maxFullLength
) {
353 maxFullLength
=length
;
355 length
=p
->specialCasing
->titleCase
[0];
356 if(length
>maxFullLength
) {
357 maxFullLength
=length
;
360 if(p
->caseFolding
!=NULL
) {
361 length
=p
->caseFolding
->full
[0];
362 if(length
>maxFullLength
) {
363 maxFullLength
=length
;
367 /* set the bits for conditional mappings */
/* clearing p->specialCasing makes every later specialCasing!=NULL section skip this character */
368 if(p
->specialCasing
!=NULL
&& p
->specialCasing
->isComplex
) {
369 excWord
|=UCASE_EXC_CONDITIONAL_SPECIAL
;
370 p
->specialCasing
=NULL
;
/* a caseFolding entry with neither a simple nor a full folding is flagged as conditional */
372 if(p
->caseFolding
!=NULL
&& p
->caseFolding
->simple
==0 && p
->caseFolding
->full
[0]==0) {
373 excWord
|=UCASE_EXC_CONDITIONAL_FOLD
;
379 * UCD stores no simple mappings when they are the same as the code point itself.
380 * SpecialCasing and CaseFolding do store simple mappings even if they are
381 * the same as the code point itself.
382 * Comparisons between simple regular mappings and simple special/folding
383 * mappings need to compensate for the difference by comparing with the
384 * original code point if a simple UCD mapping is missing (0).
387 /* remove redundant data */
388 if(p
->specialCasing
!=NULL
) {
389 /* do not store full mappings if they are the same as the simple ones */
390 if(fullMappingEqualsSimple(p
->specialCasing
->lowerCase
, p
->lowerCase
, p
->code
)) {
391 p
->specialCasing
->lowerCase
[0]=0;
393 if(fullMappingEqualsSimple(p
->specialCasing
->upperCase
, p
->upperCase
, p
->code
)) {
394 p
->specialCasing
->upperCase
[0]=0;
396 if(fullMappingEqualsSimple(p
->specialCasing
->titleCase
, p
->titleCase
, p
->code
)) {
397 p
->specialCasing
->titleCase
[0]=0;
400 if( p
->caseFolding
!=NULL
&&
401 fullMappingEqualsSimple(p
->caseFolding
->full
, p
->caseFolding
->simple
, p
->code
)
403 p
->caseFolding
->full
[0]=0;
406 /* write the optional slots */
/*
 * Each present value goes into slots[count] and sets its presence bit in
 * excWord; slotBits ORs all slot values together so that the width
 * decision further down can look at the largest bit in use.
 */
410 if(p
->lowerCase
!=0) {
411 slots
[count
]=(uint32_t)p
->lowerCase
;
412 slotBits
|=slots
[count
];
414 excWord
|=U_MASK(UCASE_EXC_LOWER
);
/* a simple folding is compared with lowerCase or with the code point itself before it is stored */
416 if( p
->caseFolding
!=NULL
&&
417 p
->caseFolding
->simple
!=0 &&
419 p
->caseFolding
->simple
!=p
->lowerCase
:
420 p
->caseFolding
->simple
!=p
->code
)
422 slots
[count
]=(uint32_t)p
->caseFolding
->simple
;
423 slotBits
|=slots
[count
];
425 excWord
|=U_MASK(UCASE_EXC_FOLD
);
427 if(p
->upperCase
!=0) {
428 slots
[count
]=(uint32_t)p
->upperCase
;
429 slotBits
|=slots
[count
];
431 excWord
|=U_MASK(UCASE_EXC_UPPER
);
/* a title slot is written only when the title mapping differs from the upper mapping */
433 if(p
->upperCase
!=p
->titleCase
) {
434 if(p
->titleCase
!=0) {
435 slots
[count
]=(uint32_t)p
->titleCase
;
/* NOTE(review): presumably the else branch -- store the code point itself when titleCase is 0; confirm */
437 slots
[count
]=(uint32_t)p
->code
;
439 slotBits
|=slots
[count
];
441 excWord
|=U_MASK(UCASE_EXC_TITLE
);
444 /* lengths of full case mapping strings, stored in the last slot */
/*
 * Nibble layout: bits 3..0 lower, 7..4 fold, 11..8 upper, 15..12 title.
 * NOTE(review): no visible check that each length fits into 4 bits.
 */
446 if(p
->specialCasing
!=NULL
) {
447 fullLengths
=p
->specialCasing
->lowerCase
[0];
448 fullLengths
|=p
->specialCasing
->upperCase
[0]<<8;
449 fullLengths
|=p
->specialCasing
->titleCase
[0]<<12;
451 if(p
->caseFolding
!=NULL
) {
452 fullLengths
|=p
->caseFolding
->full
[0]<<4;
455 slots
[count
]=fullLengths
;
456 slotBits
|=slots
[count
];
458 excWord
|=U_MASK(UCASE_EXC_FULL_MAPPINGS
);
/* if any slot value needs more than 16 bits, all slots are written as two words */
462 doubleSlots
=(UBool
)(slotBits
>0xffff);
/* single-width form: one uint16_t per slot */
464 for(i
=0; i
<count
; ++i
) {
465 exceptions
[excTop
++]=(uint16_t)slots
[i
];
/* double-width form: high 16 bits first, then low 16 bits */
468 excWord
|=UCASE_EXC_DOUBLE_SLOTS
;
469 for(i
=0; i
<count
; ++i
) {
470 exceptions
[excTop
++]=(uint16_t)(slots
[i
]>>16);
471 exceptions
[excTop
++]=(uint16_t)slots
[i
];
475 /* write the full case mapping strings */
/* order lower, fold, upper, title; each string starts at element [1] of its array */
476 if(p
->specialCasing
!=NULL
) {
477 length
=(uint16_t)p
->specialCasing
->lowerCase
[0];
478 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->lowerCase
+1, length
);
481 if(p
->caseFolding
!=NULL
) {
482 length
=(uint16_t)p
->caseFolding
->full
[0];
483 u_memcpy((UChar
*)exceptions
+excTop
, p
->caseFolding
->full
+1, length
);
486 if(p
->specialCasing
!=NULL
) {
487 length
=(uint16_t)p
->specialCasing
->upperCase
[0];
488 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->upperCase
+1, length
);
491 length
=(uint16_t)p
->specialCasing
->titleCase
[0];
492 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->titleCase
+1, length
);
/* publish the new end of the exceptions array */
496 exceptionsTop
=excTop
;
498 /* write the main exceptions word */
499 exceptions
[excIndex
]=excWord
;
512 while((row
=upvec_getRow(pv
, i
, NULL
, NULL
))!=NULL
) {
514 if(value
&UCASE_EXCEPTION
) {
515 excIndex
=makeException(value
, excProps
+(value
>>UGENCASE_EXC_SHIFT
));
516 *row
=(value
&~(UGENCASE_EXC_MASK
|UCASE_EXC_MASK
))|(excIndex
<<UCASE_EXC_SHIFT
);
522 /* generate output data ----------------------------------------------------- */
/*
 * Build the output data and write it with the udata API.
 *
 * dataDir: destination directory, passed through to udata_create().
 *
 * Steps visible here:
 *  - create a UNewTrie and set each properties-vector row's range and value in it,
 *  - serialize the trie into the static trieBlock buffer,
 *  - fill indexes[] with the exceptions length, trie size, total length
 *    and maxFullLength,
 *  - print size statistics to stdout,
 *  - write indexes[], the trie and exceptions[] in that order, then
 *    compare the length returned by udata_finish() with the computed one.
 * Failures are reported on stderr; the visible exit() calls terminate the
 * program with an ICU error code.
 *
 * NOTE(review): the declarations of pTrie, row, i, trieSize and
 * dataLength and the end of the function are not visible in this view.
 */
525 generateData(const char *dataDir
) {
526 static int32_t indexes
[UCASE_IX_TOP
]={
/* fixed-size buffer that receives the serialized trie */
529 static uint8_t trieBlock
[40000];
532 UChar32 start
, limit
;
535 UNewDataMemory
*pData
;
537 UErrorCode errorCode
=U_ZERO_ERROR
;
541 pTrie
=utrie_open(NULL
, NULL
, 20000, 0, 0, TRUE
);
543 fprintf(stderr
, "gencase error: unable to create a UNewTrie\n");
544 exit(U_MEMORY_ALLOCATION_ERROR
);
/* copy every row of the properties vectors into the trie; *row is the value for start..limit */
547 for(i
=0; (row
=upvec_getRow(pv
, i
, &start
, &limit
))!=NULL
; ++i
) {
548 if(!utrie_setRange32(pTrie
, start
, limit
, *row
, TRUE
)) {
549 fprintf(stderr
, "gencase error: unable to set trie value (overflow)\n");
550 exit(U_BUFFER_OVERFLOW_ERROR
);
554 trieSize
=utrie_serialize(pTrie
, trieBlock
, sizeof(trieBlock
), NULL
, TRUE
, &errorCode
);
555 if(U_FAILURE(errorCode
)) {
556 fprintf(stderr
, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode
), (long)trieSize
);
/* exceptionsTop counts uint16_t units, hence the factor 2 wherever bytes are needed */
560 indexes
[UCASE_IX_EXC_LENGTH
]=exceptionsTop
;
561 indexes
[UCASE_IX_TRIE_SIZE
]=trieSize
;
562 indexes
[UCASE_IX_LENGTH
]=(int32_t)sizeof(indexes
)+trieSize
+2*exceptionsTop
;
564 indexes
[UCASE_IX_MAX_FULL_LENGTH
]=maxFullLength
;
567 printf("trie size in bytes: %5d\n", (int)trieSize
);
568 printf("number of code points with exceptions: %5d\n", exceptionsCount
);
569 printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop
);
570 printf("data size: %5d\n", (int)indexes
[UCASE_IX_LENGTH
]);
574 pData
=udata_create(dataDir
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, &dataInfo
,
575 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, &errorCode
);
576 if(U_FAILURE(errorCode
)) {
577 fprintf(stderr
, "gencase: unable to create data memory, %s\n", u_errorName(errorCode
));
/* the three blocks are written in the same order as they are summed into UCASE_IX_LENGTH */
581 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
582 udata_writeBlock(pData
, trieBlock
, trieSize
);
583 udata_writeBlock(pData
, exceptions
, 2*exceptionsTop
);
586 dataLength
=udata_finish(pData
, &errorCode
);
587 if(U_FAILURE(errorCode
)) {
588 fprintf(stderr
, "gencase: error %d writing the output file\n", errorCode
);
/* consistency check of the written length against the computed length */
592 if(dataLength
!=indexes
[UCASE_IX_LENGTH
]) {
/* NOTE(review): %ld requires dataLength to be a long; its declaration is not visible here -- confirm */
593 fprintf(stderr
, "gencase: data length %ld != calculated size %d\n",
594 dataLength
, (int)indexes
[UCASE_IX_LENGTH
]);
595 exit(U_INTERNAL_PROGRAM_ERROR
);
602 * Hey, Emacs, please set the following:
605 * indent-tabs-mode: nil