2 *******************************************************************************
4 * Copyright (C) 2004-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
16 * Store Unicode case mapping properties efficiently for
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/ustring.h"
30 #include "unicode/udata.h"
36 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
38 /* Unicode case mapping properties file format ---------------------------------
40 The file format prepared and written here contains several data
41 structures that store indexes or data.
43 Before the data contents described below, there are the headers required by
44 the udata API for loading ICU data. Especially, a UDataInfo structure
45 precedes the actual data. It contains platform properties values and the
48 The following is a description of format version 1.1 .
50 Format version 1.1 adds data for case closure.
52 The file contains the following structures:
54 const int32_t indexes[i0] with values i0, i1, ...:
55 (see UCASE_IX_... constants for names of indexes)
57 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
58 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
59 i2 trieSize; -- size in bytes of the case mapping properties trie
60 i3 exceptionsLength; -- length in uint16_t of the exceptions array
61 i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
63 i5..i14 reservedIndexes; -- reserved values; 0 for now
65 i15 maxFullLength; -- maximum length of a full case mapping/folding string
68 Serialized trie, see utrie.h;
70 const uint16_t exceptions[exceptionsLength];
72 const UChar unfold[unfoldLength];
78 15..4 unsigned exception index
81 15..6 signed delta to simple case mapping code point
82 (add delta to input code point)
84 6 the code point is case-ignorable
85 (U+0307 is also case-ignorable but has an exception)
87 5..4 0 normal character with cc=0
88 1 soft-dotted character
101 A sub-array of the exceptions array is indexed by the exception index in a
103 The sub-array consists of the following fields:
105 uint16_t optional values [];
106 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
108 excWord: (see UCASE_EXC_...)
110 15 conditional case folding
111 14 conditional special casing
112 13..12 same as non-exception trie data bits 5..4
113 moved here because the exception index needs more bits than the delta
114 0 normal character with cc=0
115 1 soft-dotted character
119 8 if set, then for each optional-value slot there are 2 uint16_t values
120 (high and low parts of 32-bit values)
121 instead of single ones
122 7.. 0 bits for which optional value is present
124 Optional-value slots:
125 0 lowercase mapping (code point)
126 1 case folding (code point)
127 2 uppercase mapping (code point)
128 3 titlecase mapping (code point)
131 6 closure mappings (new in format version 1.1)
132 7 there is at least one full (string) case mapping
133 the length of each is encoded in a nibble of this optional value,
134 and the strings follow this optional value in the same order:
135 lower/fold/upper/title
137 The optional closure mappings value is used as follows:
138 Bits 0..3 contain the length of a string of code points for case closure.
139 The string immediately follows the full case mappings, or the closure value
140 slot if there are no full case mappings.
141 Bits 4..15 are reserved and could be used in the future to indicate the
142 number of strings for case closure.
143 Complete case closure for a code point is given by the union of all simple
144 and full case mappings and foldings, plus the case closure code points
145 (and potentially, in the future, case closure strings).
147 For space saving, some values are not stored. Lookups are as follows:
148 - If special casing is conditional, then no full lower/upper/title mapping
150 - If case folding is conditional, then no simple or full case foldings are
152 - Fall back in this order:
153 full (string) mapping -- if full mappings are used
154 simple (code point) mapping of the same type
155 simple fold->simple lower
156 simple title->simple upper
157 finally, the original code point (no mapping)
159 This fallback order is strict:
160 In particular, the fallback from full case folding is to simple case folding,
161 not to full lowercase mapping.
163 Reverse case folding data ("unfold") array: (new in format version 1.1)
165 This array stores some miscellaneous values followed by a table. The data maps
166 back from multi-character strings to their original code points, for use
169 The table contains two columns of strings.
170 The string in the first column is the case folding of each of the code points
171 in the second column. The strings are terminated with NUL or by the end of the
172 column, whichever comes first.
174 The miscellaneous data takes up one pseudo-row and includes:
176 - number of UChars per row
177 - number of UChars in the left (folding string) column
179 The table is sorted by its first column. Values in the first column are unique.
181 ----------------------------------------------------------------------------- */
183 /* UDataInfo cf. udata.h */
184 static UDataInfo dataInfo
={
193 /* dataFormat="cAsE" */
194 { UCASE_FMT_0
, UCASE_FMT_1
, UCASE_FMT_2
, UCASE_FMT_3
},
195 { 1, 1, UTRIE_SHIFT
, UTRIE_INDEX_SHIFT
}, /* formatVersion */
196 { 4, 0, 1, 0 } /* dataVersion */
200 /* maximum number of exceptions expected */
204 /* exceptions values */
205 static uint16_t exceptions
[UCASE_MAX_EXCEPTIONS
+100];
206 static uint16_t exceptionsTop
=0;
207 static Props excProps
[MAX_EXC_COUNT
];
208 static uint16_t exceptionsCount
=0;
210 /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
211 static int32_t maxFullLength
=U16_MAX_LENGTH
;
213 /* reverse case folding ("unfold") data */
214 static UChar unfold
[UGENCASE_UNFOLD_MAX_ROWS
*UGENCASE_UNFOLD_WIDTH
]={
215 0, UGENCASE_UNFOLD_WIDTH
, UGENCASE_UNFOLD_STRING_WIDTH
, 0, 0
217 static uint16_t unfoldRows
=0;
218 static uint16_t unfoldTop
=UGENCASE_UNFOLD_WIDTH
;
220 /* Unicode versions --------------------------------------------------------- */
222 static const UVersionInfo
235 int32_t ucdVersion
=UNI_4_1
;
238 findUnicodeVersion(const UVersionInfo version
) {
241 for(i
=0; /* while(version>unicodeVersions[i]) {} */
242 i
<UNI_VER_COUNT
&& uprv_memcmp(version
, unicodeVersions
[i
], 4)>0;
244 if(0<i
&& i
<UNI_VER_COUNT
&& uprv_memcmp(version
, unicodeVersions
[i
], 4)<0) {
245 --i
; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
247 return i
; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
251 setUnicodeVersion(const char *v
) {
252 UVersionInfo version
;
253 u_versionFromString(version
, v
);
254 uprv_memcpy(dataInfo
.dataVersion
, version
, 4);
255 ucdVersion
=findUnicodeVersion(version
);
258 /* -------------------------------------------------------------------------- */
261 addUnfolding(UChar32 c
, const UChar
*s
, int32_t length
) {
264 if(length
>UGENCASE_UNFOLD_STRING_WIDTH
) {
265 fprintf(stderr
, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
266 (long)length
, UGENCASE_UNFOLD_STRING_WIDTH
);
267 exit(U_INTERNAL_PROGRAM_ERROR
);
269 if(unfoldTop
>= (LENGTHOF(unfold
) - UGENCASE_UNFOLD_STRING_WIDTH
)) {
270 fprintf(stderr
, "gencase error: too many multi-character case foldings\n");
271 exit(U_BUFFER_OVERFLOW_ERROR
);
273 u_memset(unfold
+unfoldTop
, 0, UGENCASE_UNFOLD_WIDTH
);
274 u_memcpy(unfold
+unfoldTop
, s
, length
);
276 i
=unfoldTop
+UGENCASE_UNFOLD_STRING_WIDTH
;
277 U16_APPEND_UNSAFE(unfold
, i
, c
);
280 unfoldTop
+=UGENCASE_UNFOLD_WIDTH
;
283 /* store a character's properties ------------------------------------------- */
287 UErrorCode errorCode
;
288 uint32_t value
, oldValue
;
290 UBool isCaseIgnorable
;
292 /* get the non-UnicodeData.txt properties */
293 value
=oldValue
=upvec_getValue(pv
, p
->code
, 0);
295 /* default: map to self */
298 if(p
->gc
==U_TITLECASE_LETTER
) {
299 /* the Titlecase property is read late, from UnicodeData.txt */
303 if(p
->upperCase
!=0) {
304 /* uppercase mapping as delta if the character is lowercase */
305 if((value
&UCASE_TYPE_MASK
)==UCASE_LOWER
) {
306 delta
=p
->upperCase
-p
->code
;
308 value
|=UCASE_EXCEPTION
;
311 if(p
->lowerCase
!=0) {
312 /* lowercase mapping as delta if the character is uppercase or titlecase */
313 if((value
&UCASE_TYPE_MASK
)>=UCASE_UPPER
) {
314 delta
=p
->lowerCase
-p
->code
;
316 value
|=UCASE_EXCEPTION
;
319 if(p
->upperCase
!=p
->titleCase
) {
320 value
|=UCASE_EXCEPTION
;
322 if(p
->closure
[0]!=0) {
323 value
|=UCASE_EXCEPTION
;
325 if(p
->specialCasing
!=NULL
) {
326 value
|=UCASE_EXCEPTION
;
328 if(p
->caseFolding
!=NULL
) {
329 value
|=UCASE_EXCEPTION
;
332 if(delta
<UCASE_MIN_DELTA
|| UCASE_MAX_DELTA
<delta
) {
333 value
|=UCASE_EXCEPTION
;
337 if(value
&UCASE_DOT_MASK
) {
338 fprintf(stderr
, "gencase: a soft-dotted character has cc!=0\n");
339 exit(U_INTERNAL_PROGRAM_ERROR
);
344 value
|=UCASE_OTHER_ACCENT
;
348 /* encode case-ignorable as delta==1 on uncased characters */
349 isCaseIgnorable
=FALSE
;
350 if((value
&UCASE_TYPE_MASK
)==UCASE_NONE
) {
351 if(ucdVersion
>=UNI_4_1
) {
353 * Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk
354 * Unicode 5.1 and up: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk
355 * The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet.
358 (U_MASK(p
->gc
)&(U_GC_MN_MASK
|U_GC_ME_MASK
|U_GC_CF_MASK
|U_GC_LM_MASK
|U_GC_SK_MASK
))!=0 ||
359 (upvec_getValue(pv
, p
->code
, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT
))!=0
361 isCaseIgnorable
=TRUE
;
364 /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */
366 (U_MASK(p
->gc
)&(U_GC_MN_MASK
|U_GC_ME_MASK
|U_GC_CF_MASK
|U_GC_LM_MASK
|U_GC_SK_MASK
))!=0 ||
367 p
->code
==0x27 || p
->code
==0xad || p
->code
==0x2019
369 isCaseIgnorable
=TRUE
;
374 if(isCaseIgnorable
&& p
->code
!=0x307) {
376 * We use one of the delta/exception bits, which works because we only
377 * store the case-ignorable flag for uncased characters.
378 * There is no delta for uncased characters (see checks above).
379 * If there is an exception for an uncased, case-ignorable character
380 * (although there should not be any case mappings if it's uncased)
381 * then we have a problem.
382 * There is one character which is case-ignorable but has an exception:
383 * U+0307 is uncased, Mn, has conditional special casing and
384 * is therefore handled in code instead.
386 if(value
&UCASE_EXCEPTION
) {
387 fprintf(stderr
, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
388 (unsigned long)p
->code
);
389 exit(U_INTERNAL_PROGRAM_ERROR
);
395 /* handle exceptions */
396 if(value
&UCASE_EXCEPTION
) {
397 /* simply store exceptions for later processing and encoding */
398 value
|=(uint32_t)exceptionsCount
<<UGENCASE_EXC_SHIFT
;
399 uprv_memcpy(excProps
+exceptionsCount
, p
, sizeof(*p
));
400 if(++exceptionsCount
==MAX_EXC_COUNT
) {
401 fprintf(stderr
, "gencase: too many exceptions\n");
402 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
405 /* store the simple case mapping delta */
406 value
|=((uint32_t)delta
<<UCASE_DELTA_SHIFT
)&UCASE_DELTA_MASK
;
409 errorCode
=U_ZERO_ERROR
;
410 if( value
!=oldValue
&&
411 !upvec_setValue(pv
, p
->code
, p
->code
+1, 0, value
, 0xffffffff, &errorCode
)
413 fprintf(stderr
, "gencase error: unable to set case mapping values, code: %s\n",
414 u_errorName(errorCode
));
418 /* add the multi-character case folding to the "unfold" data */
419 if(p
->caseFolding
!=NULL
) {
420 int32_t length
=p
->caseFolding
->full
[0];
421 if(length
>1 && u_strHasMoreChar32Than(p
->caseFolding
->full
+1, length
, 1)) {
422 addUnfolding(p
->code
, p
->caseFolding
->full
+1, length
);
428 addCaseSensitive(UChar32 first
, UChar32 last
) {
429 UErrorCode errorCode
=U_ZERO_ERROR
;
430 if(!upvec_setValue(pv
, first
, last
+1, 0, UCASE_SENSITIVE
, UCASE_SENSITIVE
, &errorCode
)) {
431 fprintf(stderr
, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
432 u_errorName(errorCode
));
437 /* finalize reverse case folding ("unfold") data ---------------------------- */
439 static int32_t U_CALLCONV
440 compareUnfold(const void *context
, const void *left
, const void *right
) {
441 return u_memcmp((const UChar
*)left
, (const UChar
*)right
, UGENCASE_UNFOLD_WIDTH
);
447 iDot
[2]= { 0x69, 0x307 };
451 UErrorCode errorCode
;
454 * add a case folding that we missed because it's conditional:
455 * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
457 addUnfolding(0x130, iDot
, 2);
460 errorCode
=U_ZERO_ERROR
;
461 uprv_sortArray(unfold
+UGENCASE_UNFOLD_WIDTH
, unfoldRows
, UGENCASE_UNFOLD_WIDTH
*2,
462 compareUnfold
, NULL
, FALSE
, &errorCode
);
464 /* make unique-string rows by merging adjacent ones' code point columns */
466 /* make p point to row i-1 */
467 p
=(UChar
*)unfold
+UGENCASE_UNFOLD_WIDTH
;
469 for(i
=1; i
<unfoldRows
;) {
470 if(0==u_memcmp(p
, p
+UGENCASE_UNFOLD_WIDTH
, UGENCASE_UNFOLD_STRING_WIDTH
)) {
471 /* concatenate code point columns */
472 q
=p
+UGENCASE_UNFOLD_STRING_WIDTH
;
473 for(j
=1; j
<UGENCASE_UNFOLD_CP_WIDTH
&& q
[j
]!=0; ++j
) {}
474 for(k
=0; k
<UGENCASE_UNFOLD_CP_WIDTH
&& q
[UGENCASE_UNFOLD_WIDTH
+k
]!=0; ++j
, ++k
) {
475 q
[j
]=q
[UGENCASE_UNFOLD_WIDTH
+k
];
477 if(j
>UGENCASE_UNFOLD_CP_WIDTH
) {
478 fprintf(stderr
, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
479 (long)j
, UGENCASE_UNFOLD_CP_WIDTH
);
480 exit(U_BUFFER_OVERFLOW_ERROR
);
483 /* move following rows up one */
485 unfoldTop
-=UGENCASE_UNFOLD_WIDTH
;
486 u_memmove(p
+UGENCASE_UNFOLD_WIDTH
, p
+UGENCASE_UNFOLD_WIDTH
*2, (unfoldRows
-i
)*UGENCASE_UNFOLD_WIDTH
);
488 p
+=UGENCASE_UNFOLD_WIDTH
;
493 unfold
[UCASE_UNFOLD_ROWS
]=(UChar
)unfoldRows
;
496 puts("unfold data:");
499 for(i
=0; i
<unfoldRows
; ++i
) {
500 p
+=UGENCASE_UNFOLD_WIDTH
;
501 printf("[%2d] %04x %04x %04x <- %04x %04x\n",
502 (int)i
, p
[0], p
[1], p
[2], p
[3], p
[4]);
507 /* case closure ------------------------------------------------------------- */
510 addClosureMapping(UChar32 src
, UChar32 dest
) {
514 printf("add closure mapping U+%04lx->U+%04lx\n",
515 (unsigned long)src
, (unsigned long)dest
);
518 value
=upvec_getValue(pv
, src
, 0);
519 if(value
&UCASE_EXCEPTION
) {
520 Props
*p
=excProps
+(value
>>UGENCASE_EXC_SHIFT
);
523 /* append dest to src's closure array */
525 if(i
==LENGTHOF(p
->closure
)) {
526 fprintf(stderr
, "closure[] overflow for U+%04lx->U+%04lx\n",
527 (unsigned long)src
, (unsigned long)dest
);
528 exit(U_BUFFER_OVERFLOW_ERROR
);
529 } else if(p
->closure
[i
]==dest
) {
530 break; /* do not store duplicates */
531 } else if(p
->closure
[i
]==0) {
539 UErrorCode errorCode
;
542 * decode value into p2 (enough for makeException() to work properly),
543 * add the closure mapping,
544 * and set the new exception for src
549 if((value
&UCASE_TYPE_MASK
)>UCASE_NONE
) {
550 /* one simple case mapping, don't care which one */
551 next
=src
+((int16_t)value
>>UCASE_DELTA_SHIFT
);
553 if((value
&UCASE_TYPE_MASK
)==UCASE_LOWER
) {
554 p2
.upperCase
=p2
.titleCase
=next
;
559 } else if(value
&UCASE_DELTA_MASK
) {
560 fprintf(stderr
, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n",
562 exit(U_INTERNAL_PROGRAM_ERROR
);
565 value
&=~(UGENCASE_EXC_MASK
|UCASE_DELTA_MASK
); /* remove previous simple mapping */
566 value
|=(uint32_t)exceptionsCount
<<UGENCASE_EXC_SHIFT
;
567 value
|=UCASE_EXCEPTION
;
568 uprv_memcpy(excProps
+exceptionsCount
, &p2
, sizeof(p2
));
569 if(++exceptionsCount
==MAX_EXC_COUNT
) {
570 fprintf(stderr
, "gencase: too many exceptions\n");
571 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
574 errorCode
=U_ZERO_ERROR
;
575 if(!upvec_setValue(pv
, src
, src
+1, 0, value
, 0xffffffff, &errorCode
)) {
576 fprintf(stderr
, "gencase error: unable to set case mapping values, code: %s\n",
577 u_errorName(errorCode
));
584 * Find missing case mapping relationships and add mappings for case closure.
585 * This function starts from an "original" code point and recursively
586 * finds its case mappings and the case mappings of where it maps to.
588 * The recursion depth is capped at 3 nested calls of this function.
589 * In each call, the current code point is c, and the function enumerates
590 * all of c's simple (single-code point) case mappings.
591 * prev is the code point that case-mapped to c.
592 * prev2 is the code point that case-mapped to prev.
594 * The initial function call has prev2<0, prev<0, and c==orig
595 * (marking no code points).
596 * It enumerates c's case mappings and recurses without further action.
598 * The second-level function call has prev2<0, prev==orig, and c is
599 * the destination code point of one of prev's case mappings.
600 * The function checks if any of c's case mappings go back to orig
601 * and adds a closure mapping if not.
602 * In other words, it turns a case mapping relationship of
607 * The third-level function call has prev2==orig, prev>=0, and c is
608 * the destination code point of one of prev's case mappings.
609 * (And prev is the destination of one of prev2's case mappings.)
610 * The function checks if any of c's case mappings go back to orig
611 * and adds a closure mapping if not.
612 * In other words, it turns case mapping relationships of
613 * orig->prev->c or orig->prev<->c
615 * orig->prev->c->orig or orig->prev<->c->orig
617 * (Graphically, this closes a triangle.)
619 * With repeated application on all code points until no more closure mappings
620 * are added, all case equivalence groups get complete mappings.
621 * That is, in each group of code points with case relationships
622 * each code point will in the end have some mapping to each other
623 * code point in the group.
625 * @return TRUE if a closure mapping was added
628 addClosure(UChar32 orig
, UChar32 prev2
, UChar32 prev
, UChar32 c
, uint32_t value
) {
630 UBool someMappingsAdded
=FALSE
;
633 /* get the properties for c */
634 value
=upvec_getValue(pv
, c
, 0);
636 /* else if c==orig then c's value was passed in */
638 if(value
&UCASE_EXCEPTION
) {
642 Props
*p
=excProps
+(value
>>UGENCASE_EXC_SHIFT
);
645 * marker for whether any of c's mappings goes to orig
646 * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
648 UBool mapsToOrig
=(UBool
)(c
==orig
);
650 /* collect c's case mapping destinations in set[] */
651 if((next
=p
->upperCase
)!=0 && next
!=c
) {
654 if((next
=p
->lowerCase
)!=0 && next
!=c
) {
657 if(p
->upperCase
!=(next
=p
->titleCase
) && next
!=c
) {
660 if(p
->caseFolding
!=NULL
&& (next
=p
->caseFolding
->simple
)!=0 && next
!=c
) {
664 /* append c's current closure mappings to set[] */
665 for(i
=0; i
<LENGTHOF(p
->closure
) && (next
=p
->closure
[i
])!=0; ++i
) {
669 /* process all code points to which c case-maps */
670 for(i
=0; i
<count
; ++i
) {
671 next
=set
[i
]; /* next!=c */
674 mapsToOrig
=TRUE
; /* remember that we map to orig */
675 } else if(prev2
<0 && next
!=prev
) {
678 * we have reached maximum depth (prev2>=0) or
679 * this is a mapping to one of the previous code points (orig, prev, c)
681 someMappingsAdded
|=addClosure(orig
, prev
, c
, next
, 0);
686 addClosureMapping(c
, orig
);
690 if((value
&UCASE_TYPE_MASK
)>UCASE_NONE
) {
691 /* one simple case mapping, don't care which one */
692 next
=c
+((int16_t)value
>>UCASE_DELTA_SHIFT
);
696 * we have reached maximum depth (prev2>=0) or
697 * this is a mapping to one of the previous code points (orig, prev, c)
699 if(prev2
<0 && next
!=orig
&& next
!=prev
) {
700 someMappingsAdded
|=addClosure(orig
, prev
, c
, next
, 0);
703 if(c
!=orig
&& next
!=orig
) {
704 /* c does not map to orig, add a closure mapping c->orig */
705 addClosureMapping(c
, orig
);
712 return someMappingsAdded
;
720 UChar32 start
, limit
, c
, c2
;
722 UBool someMappingsAdded
;
725 * finalize the "unfold" data because we need to use it to add closure mappings
726 * for situations like FB05->"st"<-FB06
727 * where we would otherwise miss the FB05<->FB06 relationship
731 /* use the "unfold" data to add mappings */
733 /* p always points to the code points; this loop ignores the strings completely */
734 p
=unfold
+UGENCASE_UNFOLD_WIDTH
+UGENCASE_UNFOLD_STRING_WIDTH
;
736 for(i
=0; i
<unfoldRows
; p
+=UGENCASE_UNFOLD_WIDTH
, ++i
) {
738 U16_NEXT_UNSAFE(p
, j
, c
);
739 while(j
<UGENCASE_UNFOLD_CP_WIDTH
&& p
[j
]!=0) {
740 U16_NEXT_UNSAFE(p
, j
, c2
);
741 addClosure(c
, U_SENTINEL
, c
, c2
, 0);
746 puts("---- ---- ---- ---- (done with closures from unfolding)");
749 /* add further closure mappings from analyzing simple mappings */
751 someMappingsAdded
=FALSE
;
754 while((row
=upvec_getRow(pv
, i
, &start
, &limit
))!=NULL
) {
758 if(addClosure(start
, U_SENTINEL
, U_SENTINEL
, start
, value
)) {
759 someMappingsAdded
=TRUE
;
762 * stop this loop because pv was changed and row is not valid any more
763 * skip all rows below the current start
765 while((row
=upvec_getRow(pv
, i
, NULL
, &limit
))!=NULL
&& start
>=limit
) {
768 row
=NULL
; /* signal to continue with outer loop, without further ++i */
774 continue; /* see row=NULL above */
780 if(beVerbose
&& someMappingsAdded
) {
781 puts("---- ---- ---- ----");
783 } while(someMappingsAdded
);
786 /* exceptions --------------------------------------------------------------- */
788 /* get the string length from zero-terminated code points in a limited-length array */
790 getLengthOfCodePoints(const UChar32
*s
, int32_t maxLength
) {
793 for(i
=length
=0; i
<maxLength
&& s
[i
]!=0; ++i
) {
794 length
+=U16_LENGTH(s
[i
]);
800 fullMappingEqualsSimple(const UChar
*s
, UChar32 simple
, UChar32 c
) {
805 if(length
==0 || length
>U16_MAX_LENGTH
) {
809 U16_NEXT(s
, i
, length
, full
);
812 simple
=c
; /* UCD has no simple mapping if it's the same as the code point itself */
814 return (UBool
)(i
==length
&& full
==simple
);
818 makeException(uint32_t value
, Props
*p
) {
821 uint16_t excWord
, excIndex
, excTop
, i
, count
, length
, fullLengths
;
824 /* excIndex will be returned for storing in the trie word */
825 excIndex
=exceptionsTop
;
826 if(excIndex
>=UCASE_MAX_EXCEPTIONS
) {
827 fprintf(stderr
, "gencase error: too many exceptions words\n");
828 exit(U_BUFFER_OVERFLOW_ERROR
);
831 excTop
=excIndex
+1; /* +1 for excWord which will be stored at excIndex */
833 /* copy and shift the soft-dotted bits */
834 excWord
=((uint16_t)value
&UCASE_DOT_MASK
)<<UCASE_EXC_DOT_SHIFT
;
836 /* update maxFullLength */
837 if(p
->specialCasing
!=NULL
) {
838 length
=p
->specialCasing
->lowerCase
[0];
839 if(length
>maxFullLength
) {
840 maxFullLength
=length
;
842 length
=p
->specialCasing
->upperCase
[0];
843 if(length
>maxFullLength
) {
844 maxFullLength
=length
;
846 length
=p
->specialCasing
->titleCase
[0];
847 if(length
>maxFullLength
) {
848 maxFullLength
=length
;
851 if(p
->caseFolding
!=NULL
) {
852 length
=p
->caseFolding
->full
[0];
853 if(length
>maxFullLength
) {
854 maxFullLength
=length
;
858 /* set the bits for conditional mappings */
859 if(p
->specialCasing
!=NULL
&& p
->specialCasing
->isComplex
) {
860 excWord
|=UCASE_EXC_CONDITIONAL_SPECIAL
;
861 p
->specialCasing
=NULL
;
863 if(p
->caseFolding
!=NULL
&& p
->caseFolding
->simple
==0 && p
->caseFolding
->full
[0]==0) {
864 excWord
|=UCASE_EXC_CONDITIONAL_FOLD
;
870 * UCD stores no simple mappings when they are the same as the code point itself.
871 * SpecialCasing and CaseFolding do store simple mappings even if they are
872 * the same as the code point itself.
873 * Comparisons between simple regular mappings and simple special/folding
874 * mappings need to compensate for the difference by comparing with the
875 * original code point if a simple UCD mapping is missing (0).
878 /* remove redundant data */
879 if(p
->specialCasing
!=NULL
) {
880 /* do not store full mappings if they are the same as the simple ones */
881 if(fullMappingEqualsSimple(p
->specialCasing
->lowerCase
, p
->lowerCase
, p
->code
)) {
882 p
->specialCasing
->lowerCase
[0]=0;
884 if(fullMappingEqualsSimple(p
->specialCasing
->upperCase
, p
->upperCase
, p
->code
)) {
885 p
->specialCasing
->upperCase
[0]=0;
887 if(fullMappingEqualsSimple(p
->specialCasing
->titleCase
, p
->titleCase
, p
->code
)) {
888 p
->specialCasing
->titleCase
[0]=0;
891 if( p
->caseFolding
!=NULL
&&
892 fullMappingEqualsSimple(p
->caseFolding
->full
, p
->caseFolding
->simple
, p
->code
)
894 p
->caseFolding
->full
[0]=0;
897 /* write the optional slots */
901 if(p
->lowerCase
!=0) {
902 slots
[count
]=(uint32_t)p
->lowerCase
;
903 slotBits
|=slots
[count
];
905 excWord
|=U_MASK(UCASE_EXC_LOWER
);
907 if( p
->caseFolding
!=NULL
&&
908 p
->caseFolding
->simple
!=0 &&
910 p
->caseFolding
->simple
!=p
->lowerCase
:
911 p
->caseFolding
->simple
!=p
->code
)
913 slots
[count
]=(uint32_t)p
->caseFolding
->simple
;
914 slotBits
|=slots
[count
];
916 excWord
|=U_MASK(UCASE_EXC_FOLD
);
918 if(p
->upperCase
!=0) {
919 slots
[count
]=(uint32_t)p
->upperCase
;
920 slotBits
|=slots
[count
];
922 excWord
|=U_MASK(UCASE_EXC_UPPER
);
924 if(p
->upperCase
!=p
->titleCase
) {
925 if(p
->titleCase
!=0) {
926 slots
[count
]=(uint32_t)p
->titleCase
;
928 slots
[count
]=(uint32_t)p
->code
;
930 slotBits
|=slots
[count
];
932 excWord
|=U_MASK(UCASE_EXC_TITLE
);
935 /* length of case closure */
936 if(p
->closure
[0]!=0) {
937 length
=getLengthOfCodePoints(p
->closure
, LENGTHOF(p
->closure
));
938 slots
[count
]=(uint32_t)length
; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
939 slotBits
|=slots
[count
];
941 excWord
|=U_MASK(UCASE_EXC_CLOSURE
);
944 /* lengths of full case mapping strings, stored in the last slot */
946 if(p
->specialCasing
!=NULL
) {
947 fullLengths
=p
->specialCasing
->lowerCase
[0];
948 fullLengths
|=p
->specialCasing
->upperCase
[0]<<8;
949 fullLengths
|=p
->specialCasing
->titleCase
[0]<<12;
951 if(p
->caseFolding
!=NULL
) {
952 fullLengths
|=p
->caseFolding
->full
[0]<<4;
955 slots
[count
]=fullLengths
;
956 slotBits
|=slots
[count
];
958 excWord
|=U_MASK(UCASE_EXC_FULL_MAPPINGS
);
962 doubleSlots
=(UBool
)(slotBits
>0xffff);
964 for(i
=0; i
<count
; ++i
) {
965 exceptions
[excTop
++]=(uint16_t)slots
[i
];
968 excWord
|=UCASE_EXC_DOUBLE_SLOTS
;
969 for(i
=0; i
<count
; ++i
) {
970 exceptions
[excTop
++]=(uint16_t)(slots
[i
]>>16);
971 exceptions
[excTop
++]=(uint16_t)slots
[i
];
975 /* write the full case mapping strings */
976 if(p
->specialCasing
!=NULL
) {
977 length
=(uint16_t)p
->specialCasing
->lowerCase
[0];
978 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->lowerCase
+1, length
);
981 if(p
->caseFolding
!=NULL
) {
982 length
=(uint16_t)p
->caseFolding
->full
[0];
983 u_memcpy((UChar
*)exceptions
+excTop
, p
->caseFolding
->full
+1, length
);
986 if(p
->specialCasing
!=NULL
) {
987 length
=(uint16_t)p
->specialCasing
->upperCase
[0];
988 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->upperCase
+1, length
);
991 length
=(uint16_t)p
->specialCasing
->titleCase
[0];
992 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->titleCase
+1, length
);
996 /* write the closure data */
997 if(p
->closure
[0]!=0) {
1000 for(i
=0; i
<LENGTHOF(p
->closure
) && (c
=p
->closure
[i
])!=0; ++i
) {
1001 U16_APPEND_UNSAFE((UChar
*)exceptions
, excTop
, c
);
1005 exceptionsTop
=excTop
;
1007 /* write the main exceptions word */
1008 exceptions
[excIndex
]=excWord
;
1021 while((row
=upvec_getRow(pv
, i
, NULL
, NULL
))!=NULL
) {
1023 if(value
&UCASE_EXCEPTION
) {
1024 excIndex
=makeException(value
, excProps
+(value
>>UGENCASE_EXC_SHIFT
));
1025 *row
=(value
&~(UGENCASE_EXC_MASK
|UCASE_EXC_MASK
))|(excIndex
<<UCASE_EXC_SHIFT
);
1031 /* generate output data ----------------------------------------------------- */
1034 generateData(const char *dataDir
, UBool csource
) {
1035 static int32_t indexes
[UCASE_IX_TOP
]={
1038 static uint8_t trieBlock
[40000];
1040 const uint32_t *row
;
1041 UChar32 start
, limit
;
1044 UNewDataMemory
*pData
;
1046 UErrorCode errorCode
=U_ZERO_ERROR
;
1050 pTrie
=utrie_open(NULL
, NULL
, 20000, 0, 0, TRUE
);
1052 fprintf(stderr
, "gencase error: unable to create a UNewTrie\n");
1053 exit(U_MEMORY_ALLOCATION_ERROR
);
1056 for(i
=0; (row
=upvec_getRow(pv
, i
, &start
, &limit
))!=NULL
; ++i
) {
1057 if(!utrie_setRange32(pTrie
, start
, limit
, *row
, TRUE
)) {
1058 fprintf(stderr
, "gencase error: unable to set trie value (overflow)\n");
1059 exit(U_BUFFER_OVERFLOW_ERROR
);
1063 trieSize
=utrie_serialize(pTrie
, trieBlock
, sizeof(trieBlock
), NULL
, TRUE
, &errorCode
);
1064 if(U_FAILURE(errorCode
)) {
1065 fprintf(stderr
, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode
), (long)trieSize
);
1069 indexes
[UCASE_IX_EXC_LENGTH
]=exceptionsTop
;
1070 indexes
[UCASE_IX_TRIE_SIZE
]=trieSize
;
1071 indexes
[UCASE_IX_UNFOLD_LENGTH
]=unfoldTop
;
1072 indexes
[UCASE_IX_LENGTH
]=(int32_t)sizeof(indexes
)+trieSize
+2*exceptionsTop
+2*unfoldTop
;
1074 indexes
[UCASE_IX_MAX_FULL_LENGTH
]=maxFullLength
;
1077 printf("trie size in bytes: %5d\n", (int)trieSize
);
1078 printf("number of code points with exceptions: %5d\n", exceptionsCount
);
1079 printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop
);
1080 printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop
);
1081 printf("data size: %5d\n", (int)indexes
[UCASE_IX_LENGTH
]);
1085 /* write .c file for hardcoded data */
1086 UTrie trie
={ NULL
};
1089 utrie_unserialize(&trie
, trieBlock
, trieSize
, &errorCode
);
1090 if(U_FAILURE(errorCode
)) {
1093 "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n",
1094 u_errorName(errorCode
));
1098 f
=usrc_create(dataDir
, "ucase_props_data.c");
1101 "static const UVersionInfo ucase_props_dataVersion={",
1102 dataInfo
.dataVersion
, 8, 4,
1105 "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={",
1106 indexes
, 32, UCASE_IX_TOP
,
1108 usrc_writeUTrieArrays(f
,
1109 "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL
,
1113 "static const uint16_t ucase_props_exceptions[%ld]={\n",
1114 exceptions
, 16, exceptionsTop
,
1117 "static const uint16_t ucase_props_unfold[%ld]={\n",
1118 unfold
, 16, unfoldTop
,
1121 "static const UCaseProps ucase_props_singleton={\n"
1123 " ucase_props_indexes,\n"
1124 " ucase_props_exceptions,\n"
1125 " ucase_props_unfold,\n",
1127 usrc_writeUTrieStruct(f
,
1129 &trie
, "ucase_props_trieIndex", NULL
, NULL
,
1131 usrc_writeArray(f
, " { ", dataInfo
.formatVersion
, 8, 4, " }\n");
1136 /* write the data */
1137 pData
=udata_create(dataDir
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, &dataInfo
,
1138 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, &errorCode
);
1139 if(U_FAILURE(errorCode
)) {
1140 fprintf(stderr
, "gencase: unable to create data memory, %s\n", u_errorName(errorCode
));
1144 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
1145 udata_writeBlock(pData
, trieBlock
, trieSize
);
1146 udata_writeBlock(pData
, exceptions
, 2*exceptionsTop
);
1147 udata_writeBlock(pData
, unfold
, 2*unfoldTop
);
1150 dataLength
=udata_finish(pData
, &errorCode
);
1151 if(U_FAILURE(errorCode
)) {
1152 fprintf(stderr
, "gencase: error %d writing the output file\n", errorCode
);
1156 if(dataLength
!=indexes
[UCASE_IX_LENGTH
]) {
1157 fprintf(stderr
, "gencase: data length %ld != calculated size %d\n",
1158 dataLength
, (int)indexes
[UCASE_IX_LENGTH
]);
1159 exit(U_INTERNAL_PROGRAM_ERROR
);
1167 * Hey, Emacs, please set the following:
1170 * indent-tabs-mode: nil