2 *******************************************************************************
4 * Copyright (C) 2004-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
16 * Store Unicode case mapping properties efficiently for
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/ustring.h"
30 #include "unicode/udata.h"
36 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
38 /* Unicode case mapping properties file format ---------------------------------
40 The file format prepared and written here contains several data
41 structures that store indexes or data.
43 Before the data contents described below, there are the headers required by
44 the udata API for loading ICU data. Especially, a UDataInfo structure
45 precedes the actual data. It contains platform properties values and the file format version.
48 The following is a description of format version 1.1.
50 Format version 1.1 adds data for case closure.
52 The file contains the following structures:
54 const int32_t indexes[i0] with values i0, i1, ...:
55 (see UCASE_IX_... constants for names of indexes)
57 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
58 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
59 i2 trieSize; -- size in bytes of the case mapping properties trie
60 i3 exceptionsLength; -- length in uint16_t of the exceptions array
61 i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
63 i5..i14 reservedIndexes; -- reserved values; 0 for now
65 i15 maxFullLength; -- maximum length of a full case mapping/folding string
68 Serialized trie, see utrie.h;
70 const uint16_t exceptions[exceptionsLength];
72 const UChar unfold[unfoldLength];
78 15..4 unsigned exception index
81 15..6 signed delta to simple case mapping code point
82 (add delta to input code point)
84 6 the code point is case-ignorable
85 (U+0307 is also case-ignorable but has an exception)
87 5..4 0 normal character with cc=0
88 1 soft-dotted character
101 A sub-array of the exceptions array is indexed by the exception index in a trie word.
103 The sub-array consists of the following fields:
105 uint16_t optional values [];
106 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
108 excWord: (see UCASE_EXC_...)
110 15 conditional case folding
111 14 conditional special casing
112 13..12 same as non-exception trie data bits 5..4
113 moved here because the exception index needs more bits than the delta
114 0 normal character with cc=0
115 1 soft-dotted character
119 8 if set, then for each optional-value slot there are 2 uint16_t values
120 (high and low parts of 32-bit values)
121 instead of single ones
122 7.. 0 bits for which optional value is present
124 Optional-value slots:
125 0 lowercase mapping (code point)
126 1 case folding (code point)
127 2 uppercase mapping (code point)
128 3 titlecase mapping (code point)
131 6 closure mappings (new in format version 1.1)
132 7 there is at least one full (string) case mapping
133 the length of each is encoded in a nibble of this optional value,
134 and the strings follow this optional value in the same order:
135 lower/fold/upper/title
137 The optional closure mappings value is used as follows:
138 Bits 0..3 contain the length of a string of code points for case closure.
139 The string immediately follows the full case mappings, or the closure value
140 slot if there are no full case mappings.
141 Bits 4..15 are reserved and could be used in the future to indicate the
142 number of strings for case closure.
143 Complete case closure for a code point is given by the union of all simple
144 and full case mappings and foldings, plus the case closure code points
145 (and potentially, in the future, case closure strings).
147 For space saving, some values are not stored. Lookups are as follows:
148 - If special casing is conditional, then no full lower/upper/title mapping strings are stored.
150 - If case folding is conditional, then no simple or full case foldings are stored.
152 - Fall back in this order:
153 full (string) mapping -- if full mappings are used
154 simple (code point) mapping of the same type
155 simple fold->simple lower
156 simple title->simple upper
157 finally, the original code point (no mapping)
159 This fallback order is strict:
160 In particular, the fallback from full case folding is to simple case folding,
161 not to full lowercase mapping.
163 Reverse case folding data ("unfold") array: (new in format version 1.1)
165 This array stores some miscellaneous values followed by a table. The data maps
166 back from multi-character strings to their original code points, for use
169 The table contains two columns of strings.
170 The string in the first column is the case folding of each of the code points
171 in the second column. The strings are terminated with NUL or by the end of the
172 column, whichever comes first.
174 The miscellaneous data takes up one pseudo-row and includes:
176 - number of UChars per row
177 - number of UChars in the left (folding string) column
179 The table is sorted by its first column. Values in the first column are unique.
181 ----------------------------------------------------------------------------- */
183 /* UDataInfo cf. udata.h */
184 static UDataInfo dataInfo
={
193 /* dataFormat="cAsE" */
194 { UCASE_FMT_0
, UCASE_FMT_1
, UCASE_FMT_2
, UCASE_FMT_3
},
195 { 1, 1, UTRIE_SHIFT
, UTRIE_INDEX_SHIFT
}, /* formatVersion */
196 { 4, 0, 1, 0 } /* dataVersion */
200 /* maximum number of exceptions expected */
204 /* exceptions values */
205 static uint16_t exceptions
[UCASE_MAX_EXCEPTIONS
+100];
206 static uint16_t exceptionsTop
=0;
207 static Props excProps
[MAX_EXC_COUNT
];
208 static uint16_t exceptionsCount
=0;
210 /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
211 static int32_t maxFullLength
=U16_MAX_LENGTH
;
213 /* reverse case folding ("unfold") data */
214 static UChar unfold
[UGENCASE_UNFOLD_MAX_ROWS
*UGENCASE_UNFOLD_WIDTH
]={
215 0, UGENCASE_UNFOLD_WIDTH
, UGENCASE_UNFOLD_STRING_WIDTH
, 0, 0
217 static uint16_t unfoldRows
=0;
218 static uint16_t unfoldTop
=UGENCASE_UNFOLD_WIDTH
;
220 /* Unicode versions --------------------------------------------------------- */
222 static const UVersionInfo
235 int32_t ucdVersion
=UNI_4_1
;
/*
 * Map a 4-byte version to an index into unicodeVersions[].
 *
 * The loop advances i while version compares greater (uprv_memcmp over 4 bytes)
 * than unicodeVersions[i]. If the scan stopped on an entry that is strictly
 * greater than version, i is stepped back by one so that the result names the
 * closest entry that is not newer than version.
 *
 * version: the version to look up.
 * Returns the index i; as the trailing comment notes, i may equal UNI_VER_COUNT.
 */
238 findUnicodeVersion(const UVersionInfo version
) {
241 for(i
=0; /* while(version>unicodeVersions[i]) {} */
242 i
<UNI_VER_COUNT
&& uprv_memcmp(version
, unicodeVersions
[i
], 4)>0;
/* version falls between two table entries: use the lower one */
244 if(0<i
&& i
<UNI_VER_COUNT
&& uprv_memcmp(version
, unicodeVersions
[i
], 4)<0) {
245 --i
; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
247 return i
; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
/*
 * Record the Unicode version that the generated data is built from.
 *
 * v: version string, parsed with u_versionFromString().
 * Side effects: copies the 4 parsed version bytes into dataInfo.dataVersion
 * and stores the matching table index (from findUnicodeVersion) in ucdVersion.
 */
251 setUnicodeVersion(const char *v
) {
252 UVersionInfo version
;
253 u_versionFromString(version
, v
);
/* the data version written to the output is the UCD version */
254 uprv_memcpy(dataInfo
.dataVersion
, version
, 4);
255 ucdVersion
=findUnicodeVersion(version
);
258 /* -------------------------------------------------------------------------- */
/*
 * Append one row to the unfold[] table (reverse case folding data).
 *
 * c:      code point whose full case folding is s.
 * s:      the folding string (UTF-16), not necessarily NUL-terminated.
 * length: number of UChars in s.
 *
 * A row is UGENCASE_UNFOLD_WIDTH UChars wide. The folding string is copied to
 * the start of the row and c is appended starting at offset
 * UGENCASE_UNFOLD_STRING_WIDTH within the row.
 * The function prints a message and exits the program if the string is longer
 * than UGENCASE_UNFOLD_STRING_WIDTH or if unfoldTop has reached the end of
 * unfold[].
 */
261 addUnfolding(UChar32 c
, const UChar
*s
, int32_t length
) {
/* the string must fit into the left column of a row */
264 if(length
>UGENCASE_UNFOLD_STRING_WIDTH
) {
265 fprintf(stderr
, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
266 (long)length
, UGENCASE_UNFOLD_STRING_WIDTH
);
267 exit(U_INTERNAL_PROGRAM_ERROR
);
/* no room left for another row */
269 if(unfoldTop
>=LENGTHOF(unfold
)) {
270 fprintf(stderr
, "gencase error: too many multi-character case foldings\n");
271 exit(U_BUFFER_OVERFLOW_ERROR
);
/* zero the whole row first so that short strings end up NUL-padded */
273 u_memset(unfold
+unfoldTop
, 0, UGENCASE_UNFOLD_WIDTH
);
274 u_memcpy(unfold
+unfoldTop
, s
, length
);
/* write the code point into the right column of the row */
276 i
=unfoldTop
+UGENCASE_UNFOLD_STRING_WIDTH
;
277 U16_APPEND_UNSAFE(unfold
, i
, c
);
/* advance to the start of the next free row */
280 unfoldTop
+=UGENCASE_UNFOLD_WIDTH
;
283 /* store a character's properties ------------------------------------------- */
287 UErrorCode errorCode
;
288 uint32_t value
, oldValue
;
290 UBool isCaseIgnorable
;
292 /* get the non-UnicodeData.txt properties */
293 value
=oldValue
=upvec_getValue(pv
, p
->code
, 0);
295 /* default: map to self */
298 if(p
->gc
==U_TITLECASE_LETTER
) {
299 /* the Titlecase property is read late, from UnicodeData.txt */
303 if(p
->upperCase
!=0) {
304 /* uppercase mapping as delta if the character is lowercase */
305 if((value
&UCASE_TYPE_MASK
)==UCASE_LOWER
) {
306 delta
=p
->upperCase
-p
->code
;
308 value
|=UCASE_EXCEPTION
;
311 if(p
->lowerCase
!=0) {
312 /* lowercase mapping as delta if the character is uppercase or titlecase */
313 if((value
&UCASE_TYPE_MASK
)>=UCASE_UPPER
) {
314 delta
=p
->lowerCase
-p
->code
;
316 value
|=UCASE_EXCEPTION
;
319 if(p
->upperCase
!=p
->titleCase
) {
320 value
|=UCASE_EXCEPTION
;
322 if(p
->closure
[0]!=0) {
323 value
|=UCASE_EXCEPTION
;
325 if(p
->specialCasing
!=NULL
) {
326 value
|=UCASE_EXCEPTION
;
328 if(p
->caseFolding
!=NULL
) {
329 value
|=UCASE_EXCEPTION
;
332 if(delta
<UCASE_MIN_DELTA
|| UCASE_MAX_DELTA
<delta
) {
333 value
|=UCASE_EXCEPTION
;
337 if(value
&UCASE_DOT_MASK
) {
338 fprintf(stderr
, "gencase: a soft-dotted character has cc!=0\n");
339 exit(U_INTERNAL_PROGRAM_ERROR
);
344 value
|=UCASE_OTHER_ACCENT
;
348 /* encode case-ignorable as delta==1 on uncased characters */
349 isCaseIgnorable
=FALSE
;
350 if((value
&UCASE_TYPE_MASK
)==UCASE_NONE
) {
351 if(ucdVersion
>=UNI_4_1
) {
352 /* Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk */
354 (U_MASK(p
->gc
)&(U_GC_MN_MASK
|U_GC_ME_MASK
|U_GC_CF_MASK
|U_GC_LM_MASK
|U_GC_SK_MASK
))!=0 ||
355 ((upvec_getValue(pv
, p
->code
, 1)>>UGENCASE_IS_MID_LETTER_SHIFT
)&1)!=0
357 isCaseIgnorable
=TRUE
;
360 /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */
362 (U_MASK(p
->gc
)&(U_GC_MN_MASK
|U_GC_ME_MASK
|U_GC_CF_MASK
|U_GC_LM_MASK
|U_GC_SK_MASK
))!=0 ||
363 p
->code
==0x27 || p
->code
==0xad || p
->code
==0x2019
365 isCaseIgnorable
=TRUE
;
370 if(isCaseIgnorable
&& p
->code
!=0x307) {
372 * We use one of the delta/exception bits, which works because we only
373 * store the case-ignorable flag for uncased characters.
374 * There is no delta for uncased characters (see checks above).
375 * If there is an exception for an uncased, case-ignorable character
376 * (although there should not be any case mappings if it's uncased)
377 * then we have a problem.
378 * There is one character which is case-ignorable but has an exception:
379 * U+0307 is uncased, Mn, has conditional special casing and
380 * is therefore handled in code instead.
382 if(value
&UCASE_EXCEPTION
) {
383 fprintf(stderr
, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
384 (unsigned long)p
->code
);
385 exit(U_INTERNAL_PROGRAM_ERROR
);
391 /* handle exceptions */
392 if(value
&UCASE_EXCEPTION
) {
393 /* simply store exceptions for later processing and encoding */
394 value
|=(uint32_t)exceptionsCount
<<UGENCASE_EXC_SHIFT
;
395 uprv_memcpy(excProps
+exceptionsCount
, p
, sizeof(*p
));
396 if(++exceptionsCount
==MAX_EXC_COUNT
) {
397 fprintf(stderr
, "gencase: too many exceptions\n");
398 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
401 /* store the simple case mapping delta */
402 value
|=((uint32_t)delta
<<UCASE_DELTA_SHIFT
)&UCASE_DELTA_MASK
;
405 errorCode
=U_ZERO_ERROR
;
406 if( value
!=oldValue
&&
407 !upvec_setValue(pv
, p
->code
, p
->code
+1, 0, value
, 0xffffffff, &errorCode
)
409 fprintf(stderr
, "gencase error: unable to set case mapping values, code: %s\n",
410 u_errorName(errorCode
));
414 /* add the multi-character case folding to the "unfold" data */
415 if(p
->caseFolding
!=NULL
) {
416 int32_t length
=p
->caseFolding
->full
[0];
417 if(length
>1 && u_strHasMoreChar32Than(p
->caseFolding
->full
+1, length
, 1)) {
418 addUnfolding(p
->code
, p
->caseFolding
->full
+1, length
);
/*
 * Set the UCASE_SENSITIVE bit in column 0 of the properties vectors (pv)
 * for a range of code points.
 *
 * first, last: inclusive range; the call passes last+1 as the exclusive limit.
 * UCASE_SENSITIVE is used as both value and mask, so only that bit is touched.
 * On failure the error name is printed to stderr.
 */
424 addCaseSensitive(UChar32 first
, UChar32 last
) {
425 UErrorCode errorCode
=U_ZERO_ERROR
;
426 if(!upvec_setValue(pv
, first
, last
+1, 0, UCASE_SENSITIVE
, UCASE_SENSITIVE
, &errorCode
)) {
427 fprintf(stderr
, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
428 u_errorName(errorCode
));
433 /* finalize reverse case folding ("unfold") data ---------------------------- */
/*
 * Comparison callback passed to uprv_sortArray() for sorting unfold[] rows.
 *
 * context: not used by the visible body.
 * left, right: each points to one row of UChars.
 * Returns the u_memcmp() result over UGENCASE_UNFOLD_WIDTH UChars, that is,
 * the comparison covers the entire row and not just the string column.
 */
435 static int32_t U_CALLCONV
436 compareUnfold(const void *context
, const void *left
, const void *right
) {
437 return u_memcmp((const UChar
*)left
, (const UChar
*)right
, UGENCASE_UNFOLD_WIDTH
);
443 iDot
[2]= { 0x69, 0x307 };
447 UErrorCode errorCode
;
450 * add a case folding that we missed because it's conditional:
451 * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
453 addUnfolding(0x130, iDot
, 2);
456 errorCode
=U_ZERO_ERROR
;
457 uprv_sortArray(unfold
+UGENCASE_UNFOLD_WIDTH
, unfoldRows
, UGENCASE_UNFOLD_WIDTH
*2,
458 compareUnfold
, NULL
, FALSE
, &errorCode
);
460 /* make unique-string rows by merging adjacent ones' code point columns */
462 /* make p point to row i-1 */
463 p
=(UChar
*)unfold
+UGENCASE_UNFOLD_WIDTH
;
465 for(i
=1; i
<unfoldRows
;) {
466 if(0==u_memcmp(p
, p
+UGENCASE_UNFOLD_WIDTH
, UGENCASE_UNFOLD_STRING_WIDTH
)) {
467 /* concatenate code point columns */
468 q
=p
+UGENCASE_UNFOLD_STRING_WIDTH
;
469 for(j
=1; j
<UGENCASE_UNFOLD_CP_WIDTH
&& q
[j
]!=0; ++j
) {}
470 for(k
=0; k
<UGENCASE_UNFOLD_CP_WIDTH
&& q
[UGENCASE_UNFOLD_WIDTH
+k
]!=0; ++j
, ++k
) {
471 q
[j
]=q
[UGENCASE_UNFOLD_WIDTH
+k
];
473 if(j
>UGENCASE_UNFOLD_CP_WIDTH
) {
474 fprintf(stderr
, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
475 (long)j
, UGENCASE_UNFOLD_CP_WIDTH
);
476 exit(U_BUFFER_OVERFLOW_ERROR
);
479 /* move following rows up one */
481 unfoldTop
-=UGENCASE_UNFOLD_WIDTH
;
482 u_memmove(p
+UGENCASE_UNFOLD_WIDTH
, p
+UGENCASE_UNFOLD_WIDTH
*2, (unfoldRows
-i
)*UGENCASE_UNFOLD_WIDTH
);
484 p
+=UGENCASE_UNFOLD_WIDTH
;
489 unfold
[UCASE_UNFOLD_ROWS
]=(UChar
)unfoldRows
;
492 puts("unfold data:");
495 for(i
=0; i
<unfoldRows
; ++i
) {
496 p
+=UGENCASE_UNFOLD_WIDTH
;
497 printf("[%2d] %04x %04x %04x <- %04x %04x\n",
498 (int)i
, p
[0], p
[1], p
[2], p
[3], p
[4]);
503 /* case closure ------------------------------------------------------------- */
/*
 * Record that src has a case closure mapping to dest.
 *
 * src:  code point that receives the additional mapping.
 * dest: code point that src should map to.
 *
 * If the properties word for src already carries UCASE_EXCEPTION, dest is
 * appended to the closure[] array of the existing excProps entry; duplicates
 * are not stored and a full array is a fatal error.
 * Otherwise the visible lines decode the simple mapping from the properties
 * word into a temporary Props (p2), copy it into a new excProps slot, mark the
 * word as an exception and write it back with upvec_setValue().
 * Fatal conditions print a message and exit the program.
 */
506 addClosureMapping(UChar32 src
, UChar32 dest
) {
510 printf("add closure mapping U+%04lx->U+%04lx\n",
511 (unsigned long)src
, (unsigned long)dest
);
/* fetch the current properties word (column 0) for src */
514 value
=upvec_getValue(pv
, src
, 0);
515 if(value
&UCASE_EXCEPTION
) {
/* the upper bits of the word index the stored Props for this exception */
516 Props
*p
=excProps
+(value
>>UGENCASE_EXC_SHIFT
);
519 /* append dest to src's closure array */
521 if(i
==LENGTHOF(p
->closure
)) {
522 fprintf(stderr
, "closure[] overflow for U+%04lx->U+%04lx\n",
523 (unsigned long)src
, (unsigned long)dest
);
524 exit(U_BUFFER_OVERFLOW_ERROR
);
525 } else if(p
->closure
[i
]==dest
) {
526 break; /* do not store duplicates */
527 } else if(p
->closure
[i
]==0) {
/* from here on: src had no exception entry yet, so one is created */
535 UErrorCode errorCode
;
538 * decode value into p2 (enough for makeException() to work properly),
539 * add the closure mapping,
540 * and set the new exception for src
545 if((value
&UCASE_TYPE_MASK
)>UCASE_NONE
) {
546 /* one simple case mapping, don't care which one */
547 next
=src
+((int16_t)value
>>UCASE_DELTA_SHIFT
);
549 if((value
&UCASE_TYPE_MASK
)==UCASE_LOWER
) {
550 p2
.upperCase
=p2
.titleCase
=next
;
555 } else if(value
&UCASE_DELTA_MASK
) {
556 fprintf(stderr
, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n",
558 exit(U_INTERNAL_PROGRAM_ERROR
);
561 value
&=~(UGENCASE_EXC_MASK
|UCASE_DELTA_MASK
); /* remove previous simple mapping */
/* store the index of the new excProps slot in the word and flag it */
562 value
|=(uint32_t)exceptionsCount
<<UGENCASE_EXC_SHIFT
;
563 value
|=UCASE_EXCEPTION
;
564 uprv_memcpy(excProps
+exceptionsCount
, &p2
, sizeof(p2
));
565 if(++exceptionsCount
==MAX_EXC_COUNT
) {
566 fprintf(stderr
, "gencase: too many exceptions\n");
567 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
/* write the updated word back for the single code point src */
570 errorCode
=U_ZERO_ERROR
;
571 if(!upvec_setValue(pv
, src
, src
+1, 0, value
, 0xffffffff, &errorCode
)) {
572 fprintf(stderr
, "gencase error: unable to set case mapping values, code: %s\n",
573 u_errorName(errorCode
));
580 * Find missing case mapping relationships and add mappings for case closure.
581 * This function starts from an "original" code point and recursively
582 * finds its case mappings and the case mappings of where it maps to.
584 * The recursion depth is capped at 3 nested calls of this function.
585 * In each call, the current code point is c, and the function enumerates
586 * all of c's simple (single-code point) case mappings.
587 * prev is the code point that case-mapped to c.
588 * prev2 is the code point that case-mapped to prev.
590 * The initial function call has prev2<0, prev<0, and c==orig
591 * (marking no code points).
592 * It enumerates c's case mappings and recurses without further action.
594 * The second-level function call has prev2<0, prev==orig, and c is
595 * the destination code point of one of prev's case mappings.
596 * The function checks if any of c's case mappings go back to orig
597 * and adds a closure mapping if not.
598 * In other words, it turns a case mapping relationship of
603 * The third-level function call has prev2==orig, prev>=0, and c is
604 * the destination code point of one of prev's case mappings.
605 * (And prev is the destination of one of prev2's case mappings.)
606 * The function checks if any of c's case mappings go back to orig
607 * and adds a closure mapping if not.
608 * In other words, it turns case mapping relationships of
609 * orig->prev->c or orig->prev<->c
611 * orig->prev->c->orig or orig->prev<->c->orig
613 * (Graphically, this closes a triangle.)
615 * With repeated application on all code points until no more closure mappings
616 * are added, all case equivalence groups get complete mappings.
617 * That is, in each group of code points with case relationships
618 * each code point will in the end have some mapping to each other
619 * code point in the group.
621 * @return TRUE if a closure mapping was added
/*
 * Recursive worker for case closure.
 *
 * orig:  the code point the search started from.
 * prev2: the code point that mapped to prev, or negative if there is none.
 * prev:  the code point that mapped to c, or negative if there is none.
 * c:     the code point examined in this call.
 * value: properties word for c; the visible lines re-read it from pv
 *        (the guarding condition is not visible in this view).
 *
 * Two paths are visible: words flagged UCASE_EXCEPTION take their mapping
 * destinations from the stored Props, all other words carry at most one simple
 * mapping encoded as a signed delta. Recursion only happens while prev2<0, and
 * addClosureMapping(c, orig) is called where c has no mapping back to orig.
 * Returns someMappingsAdded.
 */
624 addClosure(UChar32 orig
, UChar32 prev2
, UChar32 prev
, UChar32 c
, uint32_t value
) {
626 UBool someMappingsAdded
=FALSE
;
629 /* get the properties for c */
630 value
=upvec_getValue(pv
, c
, 0);
632 /* else if c==orig then c's value was passed in */
634 if(value
&UCASE_EXCEPTION
) {
638 Props
*p
=excProps
+(value
>>UGENCASE_EXC_SHIFT
);
641 * marker for whether any of c's mappings goes to orig
642 * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
644 UBool mapsToOrig
=(UBool
)(c
==orig
);
646 /* collect c's case mapping destinations in set[] */
647 if((next
=p
->upperCase
)!=0 && next
!=c
) {
650 if((next
=p
->lowerCase
)!=0 && next
!=c
) {
653 if(p
->upperCase
!=(next
=p
->titleCase
) && next
!=c
) {
656 if(p
->caseFolding
!=NULL
&& (next
=p
->caseFolding
->simple
)!=0 && next
!=c
) {
660 /* append c's current closure mappings to set[] */
661 for(i
=0; i
<LENGTHOF(p
->closure
) && (next
=p
->closure
[i
])!=0; ++i
) {
665 /* process all code points to which c case-maps */
666 for(i
=0; i
<count
; ++i
) {
667 next
=set
[i
]; /* next!=c */
670 mapsToOrig
=TRUE
; /* remember that we map to orig */
671 } else if(prev2
<0 && next
!=prev
) {
674 * we have reached maximum depth (prev2>=0) or
675 * this is a mapping to one of the previous code points (orig, prev, c)
/* descend one level: c becomes prev and prev becomes prev2 */
677 someMappingsAdded
|=addClosure(orig
, prev
, c
, next
, 0);
682 addClosureMapping(c
, orig
);
/* non-exception word: the only possible mapping is the delta in the word */
686 if((value
&UCASE_TYPE_MASK
)>UCASE_NONE
) {
687 /* one simple case mapping, don't care which one */
688 next
=c
+((int16_t)value
>>UCASE_DELTA_SHIFT
);
692 * we have reached maximum depth (prev2>=0) or
693 * this is a mapping to one of the previous code points (orig, prev, c)
695 if(prev2
<0 && next
!=orig
&& next
!=prev
) {
696 someMappingsAdded
|=addClosure(orig
, prev
, c
, next
, 0);
699 if(c
!=orig
&& next
!=orig
) {
700 /* c does not map to orig, add a closure mapping c->orig */
701 addClosureMapping(c
, orig
);
708 return someMappingsAdded
;
716 UChar32 start
, limit
, c
, c2
;
718 UBool someMappingsAdded
;
721 * finalize the "unfold" data because we need to use it to add closure mappings
722 * for situations like FB05->"st"<-FB06
723 * where we would otherwise miss the FB05<->FB06 relationship
727 /* use the "unfold" data to add mappings */
729 /* p always points to the code points; this loop ignores the strings completely */
730 p
=unfold
+UGENCASE_UNFOLD_WIDTH
+UGENCASE_UNFOLD_STRING_WIDTH
;
732 for(i
=0; i
<unfoldRows
; p
+=UGENCASE_UNFOLD_WIDTH
, ++i
) {
734 U16_NEXT_UNSAFE(p
, j
, c
);
735 while(j
<UGENCASE_UNFOLD_CP_WIDTH
&& p
[j
]!=0) {
736 U16_NEXT_UNSAFE(p
, j
, c2
);
737 addClosure(c
, U_SENTINEL
, c
, c2
, 0);
742 puts("---- ---- ---- ---- (done with closures from unfolding)");
745 /* add further closure mappings from analyzing simple mappings */
747 someMappingsAdded
=FALSE
;
750 while((row
=upvec_getRow(pv
, i
, &start
, &limit
))!=NULL
) {
754 if(addClosure(start
, U_SENTINEL
, U_SENTINEL
, start
, value
)) {
755 someMappingsAdded
=TRUE
;
758 * stop this loop because pv was changed and row is not valid any more
759 * skip all rows below the current start
761 while((row
=upvec_getRow(pv
, i
, NULL
, &limit
))!=NULL
&& start
>=limit
) {
764 row
=NULL
; /* signal to continue with outer loop, without further ++i */
770 continue; /* see row=NULL above */
776 if(beVerbose
&& someMappingsAdded
) {
777 puts("---- ---- ---- ----");
779 } while(someMappingsAdded
);
782 /* exceptions --------------------------------------------------------------- */
784 /* get the string length from zero-terminated code points in a limited-length array */
/*
 * Compute how many UTF-16 code units are needed for a list of code points.
 *
 * s:         array of code points; a 0 entry ends the list early.
 * maxLength: capacity of s, so at most this many entries are read.
 * The loop sums U16_LENGTH() of each entry into length.
 */
786 getLengthOfCodePoints(const UChar32
*s
, int32_t maxLength
) {
789 for(i
=length
=0; i
<maxLength
&& s
[i
]!=0; ++i
) {
790 length
+=U16_LENGTH(s
[i
]);
/*
 * Test whether a full (string) mapping is just the simple mapping.
 *
 * s:      full mapping; the final comparison of i against length after
 *         reading from s suggests a length-prefixed layout -
 *         NOTE(review): the line that sets length is not visible here.
 * simple: the simple mapping; per the comment below, c is substituted when
 *         UCD stores no simple mapping (the guarding test is not visible).
 * c:      the original code point.
 *
 * Returns true only if the string consists of exactly one code point
 * (i==length after reading one) and that code point equals simple.
 */
796 fullMappingEqualsSimple(const UChar
*s
, UChar32 simple
, UChar32 c
) {
/* empty strings and strings longer than one code point can never match */
801 if(length
==0 || length
>U16_MAX_LENGTH
) {
805 U16_NEXT(s
, i
, length
, full
);
808 simple
=c
; /* UCD has no simple mapping if it's the same as the code point itself */
810 return (UBool
)(i
==length
&& full
==simple
);
/*
 * Encode one exception entry into exceptions[] starting at exceptionsTop.
 *
 * value: the properties word of the code point; only its soft-dotted bits
 *        (UCASE_DOT_MASK) are read in the visible lines.
 * p:     the collected properties of the code point. Note that this function
 *        modifies *p: it clears p->specialCasing for complex (conditional)
 *        special casing and zeroes the length of full mappings that merely
 *        repeat the simple mapping.
 *
 * Order of output, as visible below: the excWord is reserved at excIndex,
 * followed by the optional slots, the full mapping strings
 * (lower, fold, upper, title) and finally the closure code points.
 * maxFullLength is raised if any full mapping is longer than seen so far.
 * excIndex is the index of the excWord (see the comment at its assignment);
 * running out of room is a fatal error that exits the program.
 */
814 makeException(uint32_t value
, Props
*p
) {
817 uint16_t excWord
, excIndex
, excTop
, i
, count
, length
, fullLengths
;
820 /* excIndex will be returned for storing in the trie word */
821 excIndex
=exceptionsTop
;
822 if(excIndex
>=UCASE_MAX_EXCEPTIONS
) {
823 fprintf(stderr
, "gencase error: too many exceptions words\n");
824 exit(U_BUFFER_OVERFLOW_ERROR
);
827 excTop
=excIndex
+1; /* +1 for excWord which will be stored at excIndex */
829 /* copy and shift the soft-dotted bits */
830 excWord
=((uint16_t)value
&UCASE_DOT_MASK
)<<UCASE_EXC_DOT_SHIFT
;
832 /* update maxFullLength */
833 if(p
->specialCasing
!=NULL
) {
834 length
=p
->specialCasing
->lowerCase
[0];
835 if(length
>maxFullLength
) {
836 maxFullLength
=length
;
838 length
=p
->specialCasing
->upperCase
[0];
839 if(length
>maxFullLength
) {
840 maxFullLength
=length
;
842 length
=p
->specialCasing
->titleCase
[0];
843 if(length
>maxFullLength
) {
844 maxFullLength
=length
;
847 if(p
->caseFolding
!=NULL
) {
848 length
=p
->caseFolding
->full
[0];
849 if(length
>maxFullLength
) {
850 maxFullLength
=length
;
854 /* set the bits for conditional mappings */
855 if(p
->specialCasing
!=NULL
&& p
->specialCasing
->isComplex
) {
856 excWord
|=UCASE_EXC_CONDITIONAL_SPECIAL
;
/* conditional special casing is not stored as data, so drop it here */
857 p
->specialCasing
=NULL
;
/* a folding entry with neither a simple nor a full mapping marks conditional folding */
859 if(p
->caseFolding
!=NULL
&& p
->caseFolding
->simple
==0 && p
->caseFolding
->full
[0]==0) {
860 excWord
|=UCASE_EXC_CONDITIONAL_FOLD
;
866 * UCD stores no simple mappings when they are the same as the code point itself.
867 * SpecialCasing and CaseFolding do store simple mappings even if they are
868 * the same as the code point itself.
869 * Comparisons between simple regular mappings and simple special/folding
870 * mappings need to compensate for the difference by comparing with the
871 * original code point if a simple UCD mapping is missing (0).
874 /* remove redundant data */
875 if(p
->specialCasing
!=NULL
) {
876 /* do not store full mappings if they are the same as the simple ones */
877 if(fullMappingEqualsSimple(p
->specialCasing
->lowerCase
, p
->lowerCase
, p
->code
)) {
878 p
->specialCasing
->lowerCase
[0]=0;
880 if(fullMappingEqualsSimple(p
->specialCasing
->upperCase
, p
->upperCase
, p
->code
)) {
881 p
->specialCasing
->upperCase
[0]=0;
883 if(fullMappingEqualsSimple(p
->specialCasing
->titleCase
, p
->titleCase
, p
->code
)) {
884 p
->specialCasing
->titleCase
[0]=0;
887 if( p
->caseFolding
!=NULL
&&
888 fullMappingEqualsSimple(p
->caseFolding
->full
, p
->caseFolding
->simple
, p
->code
)
890 p
->caseFolding
->full
[0]=0;
893 /* write the optional slots */
/* slotBits accumulates the OR of all slot values to detect values wider than 16 bits */
897 if(p
->lowerCase
!=0) {
898 slots
[count
]=(uint32_t)p
->lowerCase
;
899 slotBits
|=slots
[count
];
901 excWord
|=U_MASK(UCASE_EXC_LOWER
);
903 if( p
->caseFolding
!=NULL
&&
904 p
->caseFolding
->simple
!=0 &&
906 p
->caseFolding
->simple
!=p
->lowerCase
:
907 p
->caseFolding
->simple
!=p
->code
)
909 slots
[count
]=(uint32_t)p
->caseFolding
->simple
;
910 slotBits
|=slots
[count
];
912 excWord
|=U_MASK(UCASE_EXC_FOLD
);
914 if(p
->upperCase
!=0) {
915 slots
[count
]=(uint32_t)p
->upperCase
;
916 slotBits
|=slots
[count
];
918 excWord
|=U_MASK(UCASE_EXC_UPPER
);
/* a title slot is only needed when titlecase differs from uppercase */
920 if(p
->upperCase
!=p
->titleCase
) {
921 if(p
->titleCase
!=0) {
922 slots
[count
]=(uint32_t)p
->titleCase
;
924 slots
[count
]=(uint32_t)p
->code
;
926 slotBits
|=slots
[count
];
928 excWord
|=U_MASK(UCASE_EXC_TITLE
);
931 /* length of case closure */
932 if(p
->closure
[0]!=0) {
933 length
=getLengthOfCodePoints(p
->closure
, LENGTHOF(p
->closure
));
934 slots
[count
]=(uint32_t)length
; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
935 slotBits
|=slots
[count
];
937 excWord
|=U_MASK(UCASE_EXC_CLOSURE
);
940 /* lengths of full case mapping strings, stored in the last slot */
/* nibble layout: bits 3..0 lower, bits 7..4 fold, bits 11..8 upper, bits 15..12 title */
942 if(p
->specialCasing
!=NULL
) {
943 fullLengths
=p
->specialCasing
->lowerCase
[0];
944 fullLengths
|=p
->specialCasing
->upperCase
[0]<<8;
945 fullLengths
|=p
->specialCasing
->titleCase
[0]<<12;
947 if(p
->caseFolding
!=NULL
) {
948 fullLengths
|=p
->caseFolding
->full
[0]<<4;
951 slots
[count
]=fullLengths
;
952 slotBits
|=slots
[count
];
954 excWord
|=U_MASK(UCASE_EXC_FULL_MAPPINGS
);
/* one slot value above 0xffff makes every slot two uint16_t wide (high word first) */
958 doubleSlots
=(UBool
)(slotBits
>0xffff);
960 for(i
=0; i
<count
; ++i
) {
961 exceptions
[excTop
++]=(uint16_t)slots
[i
];
964 excWord
|=UCASE_EXC_DOUBLE_SLOTS
;
965 for(i
=0; i
<count
; ++i
) {
966 exceptions
[excTop
++]=(uint16_t)(slots
[i
]>>16);
967 exceptions
[excTop
++]=(uint16_t)slots
[i
];
971 /* write the full case mapping strings */
/* element [0] of each mapping holds its length, the text starts at [1] */
972 if(p
->specialCasing
!=NULL
) {
973 length
=(uint16_t)p
->specialCasing
->lowerCase
[0];
974 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->lowerCase
+1, length
);
977 if(p
->caseFolding
!=NULL
) {
978 length
=(uint16_t)p
->caseFolding
->full
[0];
979 u_memcpy((UChar
*)exceptions
+excTop
, p
->caseFolding
->full
+1, length
);
982 if(p
->specialCasing
!=NULL
) {
983 length
=(uint16_t)p
->specialCasing
->upperCase
[0];
984 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->upperCase
+1, length
);
987 length
=(uint16_t)p
->specialCasing
->titleCase
[0];
988 u_memcpy((UChar
*)exceptions
+excTop
, p
->specialCasing
->titleCase
+1, length
);
992 /* write the closure data */
993 if(p
->closure
[0]!=0) {
996 for(i
=0; i
<LENGTHOF(p
->closure
) && (c
=p
->closure
[i
])!=0; ++i
) {
997 U16_APPEND_UNSAFE((UChar
*)exceptions
, excTop
, c
);
/* commit everything written for this entry */
1001 exceptionsTop
=excTop
;
1003 /* write the main exceptions word */
1004 exceptions
[excIndex
]=excWord
;
1017 while((row
=upvec_getRow(pv
, i
, NULL
, NULL
))!=NULL
) {
1019 if(value
&UCASE_EXCEPTION
) {
1020 excIndex
=makeException(value
, excProps
+(value
>>UGENCASE_EXC_SHIFT
));
1021 *row
=(value
&~(UGENCASE_EXC_MASK
|UCASE_EXC_MASK
))|(excIndex
<<UCASE_EXC_SHIFT
);
1027 /* generate output data ----------------------------------------------------- */
/*
 * Build the final trie and write the case properties data.
 *
 * dataDir: output directory, passed to usrc_create() and udata_create().
 * csource: presumably selects the C-source output over the binary output;
 *          NOTE(review): the branching line is not visible in this view.
 *
 * Visible steps:
 *  - open a UNewTrie and copy every row of pv into it with utrie_setRange32(),
 *  - serialize the trie into the static trieBlock buffer,
 *  - fill indexes[] with the section sizes and maxFullLength,
 *  - print the sizes,
 *  - write ucase_props_data.c through the usrc_* helpers, and
 *  - write the binary file through udata_create/udata_writeBlock/udata_finish,
 *    checking that the written length equals indexes[UCASE_IX_LENGTH].
 * Trie creation failure, trie overflow and a length mismatch visibly print a
 * message and call exit(); for the other error checks only the message or the
 * condition is visible in this view.
 */
1030 generateData(const char *dataDir
, UBool csource
) {
1031 static int32_t indexes
[UCASE_IX_TOP
]={
1034 static uint8_t trieBlock
[40000];
1036 const uint32_t *row
;
1037 UChar32 start
, limit
;
1040 UNewDataMemory
*pData
;
1042 UErrorCode errorCode
=U_ZERO_ERROR
;
1046 pTrie
=utrie_open(NULL
, NULL
, 20000, 0, 0, TRUE
);
1048 fprintf(stderr
, "gencase error: unable to create a UNewTrie\n");
1049 exit(U_MEMORY_ALLOCATION_ERROR
);
/* transfer each [start, limit) range and its properties word into the trie */
1052 for(i
=0; (row
=upvec_getRow(pv
, i
, &start
, &limit
))!=NULL
; ++i
) {
1053 if(!utrie_setRange32(pTrie
, start
, limit
, *row
, TRUE
)) {
1054 fprintf(stderr
, "gencase error: unable to set trie value (overflow)\n");
1055 exit(U_BUFFER_OVERFLOW_ERROR
);
1059 trieSize
=utrie_serialize(pTrie
, trieBlock
, sizeof(trieBlock
), NULL
, TRUE
, &errorCode
);
1060 if(U_FAILURE(errorCode
)) {
1061 fprintf(stderr
, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode
), (long)trieSize
);
/* exceptionsTop and unfoldTop count uint16_t units, hence the factor 2 for byte sizes */
1065 indexes
[UCASE_IX_EXC_LENGTH
]=exceptionsTop
;
1066 indexes
[UCASE_IX_TRIE_SIZE
]=trieSize
;
1067 indexes
[UCASE_IX_UNFOLD_LENGTH
]=unfoldTop
;
1068 indexes
[UCASE_IX_LENGTH
]=(int32_t)sizeof(indexes
)+trieSize
+2*exceptionsTop
+2*unfoldTop
;
1070 indexes
[UCASE_IX_MAX_FULL_LENGTH
]=maxFullLength
;
1073 printf("trie size in bytes: %5d\n", (int)trieSize
);
1074 printf("number of code points with exceptions: %5d\n", exceptionsCount
);
1075 printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop
);
1076 printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop
);
1077 printf("data size: %5d\n", (int)indexes
[UCASE_IX_LENGTH
]);
1081 /* write .c file for hardcoded data */
1082 UTrie trie
={ NULL
};
/* read the serialized trie back so that its arrays can be dumped as C source */
1085 utrie_unserialize(&trie
, trieBlock
, trieSize
, &errorCode
);
1086 if(U_FAILURE(errorCode
)) {
1089 "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n",
1090 u_errorName(errorCode
));
1094 f
=usrc_create(dataDir
, "ucase_props_data.c");
1097 "static const UVersionInfo ucase_props_dataVersion={",
1098 dataInfo
.dataVersion
, 8, 4,
1101 "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={",
1102 indexes
, 32, UCASE_IX_TOP
,
1104 usrc_writeUTrieArrays(f
,
1105 "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL
,
1109 "static const uint16_t ucase_props_exceptions[%ld]={\n",
1110 exceptions
, 16, exceptionsTop
,
1113 "static const uint16_t ucase_props_unfold[%ld]={\n",
1114 unfold
, 16, unfoldTop
,
1117 "static const UCaseProps ucase_props_singleton={\n"
1119 " ucase_props_indexes,\n"
1120 " ucase_props_exceptions,\n"
1121 " ucase_props_unfold,\n",
1123 usrc_writeUTrieStruct(f
,
1125 &trie
, "ucase_props_trieIndex", NULL
, NULL
,
1127 usrc_writeArray(f
, " { ", dataInfo
.formatVersion
, 8, 4, " }\n");
1132 /* write the data */
1133 pData
=udata_create(dataDir
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, &dataInfo
,
1134 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, &errorCode
);
1135 if(U_FAILURE(errorCode
)) {
1136 fprintf(stderr
, "gencase: unable to create data memory, %s\n", u_errorName(errorCode
));
/* the blocks are written in the same order as the sizes summed into indexes[UCASE_IX_LENGTH] */
1140 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
1141 udata_writeBlock(pData
, trieBlock
, trieSize
);
1142 udata_writeBlock(pData
, exceptions
, 2*exceptionsTop
);
1143 udata_writeBlock(pData
, unfold
, 2*unfoldTop
);
1146 dataLength
=udata_finish(pData
, &errorCode
);
1147 if(U_FAILURE(errorCode
)) {
1148 fprintf(stderr
, "gencase: error %d writing the output file\n", errorCode
);
/* sanity check: bytes actually written must match the precomputed total */
1152 if(dataLength
!=indexes
[UCASE_IX_LENGTH
]) {
1153 fprintf(stderr
, "gencase: data length %ld != calculated size %d\n",
1154 dataLength
, (int)indexes
[UCASE_IX_LENGTH
]);
1155 exit(U_INTERNAL_PROGRAM_ERROR
);
1163 * Hey, Emacs, please set the following:
1166 * indent-tabs-mode: nil