]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/gencase/store.c
ICU-400.38.tar.gz
[apple/icu.git] / icuSources / tools / gencase / store.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: store.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
15 *
16 * Store Unicode case mapping properties efficiently for
17 * random access.
18 */
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/ustring.h"
25 #include "cmemory.h"
26 #include "cstring.h"
27 #include "filestrm.h"
28 #include "utrie.h"
29 #include "uarrsort.h"
30 #include "unicode/udata.h"
31 #include "unewdata.h"
32 #include "propsvec.h"
33 #include "writesrc.h"
34 #include "gencase.h"
35
36 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
37
38 /* Unicode case mapping properties file format ---------------------------------
39
40 The file format prepared and written here contains several data
41 structures that store indexes or data.
42
43 Before the data contents described below, there are the headers required by
44 the udata API for loading ICU data. Especially, a UDataInfo structure
45 precedes the actual data. It contains platform properties values and the
46 file format version.
47
48 The following is a description of format version 1.1 .
49
50 Format version 1.1 adds data for case closure.
51
52 The file contains the following structures:
53
54 const int32_t indexes[i0] with values i0, i1, ...:
55 (see UCASE_IX_... constants for names of indexes)
56
57 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
58 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
59 i2 trieSize; -- size in bytes of the case mapping properties trie
60 i3 exceptionsLength; -- length in uint16_t of the exceptions array
61 i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
62
63 i5..i14 reservedIndexes; -- reserved values; 0 for now
64
65 i15 maxFullLength; -- maximum length of a full case mapping/folding string
66
67
68 Serialized trie, see utrie.h;
69
70 const uint16_t exceptions[exceptionsLength];
71
72 const UChar unfold[unfoldLength];
73
74
75 Trie data word:
76 Bits
77 if(exception) {
78 15..4 unsigned exception index
79 } else {
80 if(not uncased) {
81 15..6 signed delta to simple case mapping code point
82 (add delta to input code point)
83 } else {
84 6 the code point is case-ignorable
85 (U+0307 is also case-ignorable but has an exception)
86 }
87 5..4 0 normal character with cc=0
88 1 soft-dotted character
89 2 cc=230
90 3 other cc
91 }
92 3 exception
93 2 case sensitive
94 1..0 0 uncased
95 1 lowercase
96 2 uppercase
97 3 titlecase
98
99
100 Exceptions:
101 A sub-array of the exceptions array is indexed by the exception index in a
102 trie word.
103 The sub-array consists of the following fields:
104 uint16_t excWord;
105 uint16_t optional values [];
106 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
107
108 excWord: (see UCASE_EXC_...)
109 Bits
110 15 conditional case folding
111 14 conditional special casing
112 13..12 same as non-exception trie data bits 5..4
113 moved here because the exception index needs more bits than the delta
114 0 normal character with cc=0
115 1 soft-dotted character
116 2 cc=230
117 3 other cc
118 11.. 9 reserved
119 8 if set, then for each optional-value slot there are 2 uint16_t values
120 (high and low parts of 32-bit values)
121 instead of single ones
122 7.. 0 bits for which optional value is present
123
124 Optional-value slots:
125 0 lowercase mapping (code point)
126 1 case folding (code point)
127 2 uppercase mapping (code point)
128 3 titlecase mapping (code point)
129 4 reserved
130 5 reserved
131 6 closure mappings (new in format version 1.1)
132 7 there is at least one full (string) case mapping
133 the length of each is encoded in a nibble of this optional value,
134 and the strings follow this optional value in the same order:
135 lower/fold/upper/title
136
137 The optional closure mappings value is used as follows:
138 Bits 0..3 contain the length of a string of code points for case closure.
139 The string immediately follows the full case mappings, or the closure value
140 slot if there are no full case mappings.
141 Bits 4..15 are reserved and could be used in the future to indicate the
142 number of strings for case closure.
143 Complete case closure for a code point is given by the union of all simple
144 and full case mappings and foldings, plus the case closure code points
145 (and potentially, in the future, case closure strings).
146
147 For space saving, some values are not stored. Lookups are as follows:
148 - If special casing is conditional, then no full lower/upper/title mapping
149 strings are stored.
150 - If case folding is conditional, then no simple or full case foldings are
151 stored.
152 - Fall back in this order:
153 full (string) mapping -- if full mappings are used
154 simple (code point) mapping of the same type
155 simple fold->simple lower
156 simple title->simple upper
157 finally, the original code point (no mapping)
158
159 This fallback order is strict:
160 In particular, the fallback from full case folding is to simple case folding,
161 not to full lowercase mapping.
162
163 Reverse case folding data ("unfold") array: (new in format version 1.1)
164
165 This array stores some miscellaneous values followed by a table. The data maps
166 back from multi-character strings to their original code points, for use
167 in case closure.
168
169 The table contains two columns of strings.
170 The string in the first column is the case folding of each of the code points
171 in the second column. The strings are terminated with NUL or by the end of the
172 column, whichever comes first.
173
174 The miscellaneous data takes up one pseudo-row and includes:
175 - number of rows
176 - number of UChars per row
177 - number of UChars in the left (folding string) column
178
179 The table is sorted by its first column. Values in the first column are unique.
180
181 ----------------------------------------------------------------------------- */
182
183 /* UDataInfo cf. udata.h */
184 static UDataInfo dataInfo={
185 sizeof(UDataInfo),
186 0,
187
188 U_IS_BIG_ENDIAN,
189 U_CHARSET_FAMILY,
190 U_SIZEOF_UCHAR,
191 0,
192
193 /* dataFormat="cAsE" */
194 { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
195 { 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
196 { 4, 0, 1, 0 } /* dataVersion */
197 };
198
199 enum {
200 /* maximum number of exceptions expected */
201 MAX_EXC_COUNT=1000
202 };
203
204 /* exceptions values */
205 static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100];
206 static uint16_t exceptionsTop=0;
207 static Props excProps[MAX_EXC_COUNT];
208 static uint16_t exceptionsCount=0;
209
210 /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
211 static int32_t maxFullLength=U16_MAX_LENGTH;
212
213 /* reverse case folding ("unfold") data */
214 static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={
215 0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0
216 };
217 static uint16_t unfoldRows=0;
218 static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH;
219
220 /* Unicode versions --------------------------------------------------------- */
221
222 static const UVersionInfo
223 unicodeVersions[]={
224 { 1, 0, 0, 0 },
225 { 1, 1, 0, 0 },
226 { 2, 0, 0, 0 },
227 { 3, 0, 0, 0 },
228 { 3, 1, 0, 0 },
229 { 3, 2, 0, 0 },
230 { 4, 0, 0, 0 },
231 { 4, 0, 1, 0 },
232 { 4, 1, 0, 0 }
233 };
234
235 int32_t ucdVersion=UNI_4_1;
236
237 static int32_t
238 findUnicodeVersion(const UVersionInfo version) {
239 int32_t i;
240
241 for(i=0; /* while(version>unicodeVersions[i]) {} */
242 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
243 ++i) {}
244 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
245 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
246 }
247 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
248 }
249
250 extern void
251 setUnicodeVersion(const char *v) {
252 UVersionInfo version;
253 u_versionFromString(version, v);
254 uprv_memcpy(dataInfo.dataVersion, version, 4);
255 ucdVersion=findUnicodeVersion(version);
256 }
257
258 /* -------------------------------------------------------------------------- */
259
260 static void
261 addUnfolding(UChar32 c, const UChar *s, int32_t length) {
262 int32_t i;
263
264 if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
265 fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
266 (long)length, UGENCASE_UNFOLD_STRING_WIDTH);
267 exit(U_INTERNAL_PROGRAM_ERROR);
268 }
269 if(unfoldTop >= (LENGTHOF(unfold) - UGENCASE_UNFOLD_STRING_WIDTH)) {
270 fprintf(stderr, "gencase error: too many multi-character case foldings\n");
271 exit(U_BUFFER_OVERFLOW_ERROR);
272 }
273 u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH);
274 u_memcpy(unfold+unfoldTop, s, length);
275
276 i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH;
277 U16_APPEND_UNSAFE(unfold, i, c);
278
279 ++unfoldRows;
280 unfoldTop+=UGENCASE_UNFOLD_WIDTH;
281 }
282
283 /* store a character's properties ------------------------------------------- */
284
285 extern void
286 setProps(Props *p) {
287 UErrorCode errorCode;
288 uint32_t value, oldValue;
289 int32_t delta;
290 UBool isCaseIgnorable;
291
292 /* get the non-UnicodeData.txt properties */
293 value=oldValue=upvec_getValue(pv, p->code, 0);
294
295 /* default: map to self */
296 delta=0;
297
298 if(p->gc==U_TITLECASE_LETTER) {
299 /* the Titlecase property is read late, from UnicodeData.txt */
300 value|=UCASE_TITLE;
301 }
302
303 if(p->upperCase!=0) {
304 /* uppercase mapping as delta if the character is lowercase */
305 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
306 delta=p->upperCase-p->code;
307 } else {
308 value|=UCASE_EXCEPTION;
309 }
310 }
311 if(p->lowerCase!=0) {
312 /* lowercase mapping as delta if the character is uppercase or titlecase */
313 if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
314 delta=p->lowerCase-p->code;
315 } else {
316 value|=UCASE_EXCEPTION;
317 }
318 }
319 if(p->upperCase!=p->titleCase) {
320 value|=UCASE_EXCEPTION;
321 }
322 if(p->closure[0]!=0) {
323 value|=UCASE_EXCEPTION;
324 }
325 if(p->specialCasing!=NULL) {
326 value|=UCASE_EXCEPTION;
327 }
328 if(p->caseFolding!=NULL) {
329 value|=UCASE_EXCEPTION;
330 }
331
332 if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
333 value|=UCASE_EXCEPTION;
334 }
335
336 if(p->cc!=0) {
337 if(value&UCASE_DOT_MASK) {
338 fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
339 exit(U_INTERNAL_PROGRAM_ERROR);
340 }
341 if(p->cc==230) {
342 value|=UCASE_ABOVE;
343 } else {
344 value|=UCASE_OTHER_ACCENT;
345 }
346 }
347
348 /* encode case-ignorable as delta==1 on uncased characters */
349 isCaseIgnorable=FALSE;
350 if((value&UCASE_TYPE_MASK)==UCASE_NONE) {
351 if(ucdVersion>=UNI_4_1) {
352 /*
353 * Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk
354 * Unicode 5.1 and up: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk
355 * The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet.
356 */
357 if(
358 (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
359 (upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0
360 ) {
361 isCaseIgnorable=TRUE;
362 }
363 } else {
364 /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */
365 if(
366 (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
367 p->code==0x27 || p->code==0xad || p->code==0x2019
368 ) {
369 isCaseIgnorable=TRUE;
370 }
371 }
372 }
373
374 if(isCaseIgnorable && p->code!=0x307) {
375 /*
376 * We use one of the delta/exception bits, which works because we only
377 * store the case-ignorable flag for uncased characters.
378 * There is no delta for uncased characters (see checks above).
379 * If there is an exception for an uncased, case-ignorable character
380 * (although there should not be any case mappings if it's uncased)
381 * then we have a problem.
382 * There is one character which is case-ignorable but has an exception:
383 * U+0307 is uncased, Mn, has conditional special casing and
384 * is therefore handled in code instead.
385 */
386 if(value&UCASE_EXCEPTION) {
387 fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
388 (unsigned long)p->code);
389 exit(U_INTERNAL_PROGRAM_ERROR);
390 }
391
392 delta=1;
393 }
394
395 /* handle exceptions */
396 if(value&UCASE_EXCEPTION) {
397 /* simply store exceptions for later processing and encoding */
398 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
399 uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
400 if(++exceptionsCount==MAX_EXC_COUNT) {
401 fprintf(stderr, "gencase: too many exceptions\n");
402 exit(U_INDEX_OUTOFBOUNDS_ERROR);
403 }
404 } else {
405 /* store the simple case mapping delta */
406 value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
407 }
408
409 errorCode=U_ZERO_ERROR;
410 if( value!=oldValue &&
411 !upvec_setValue(pv, p->code, p->code+1, 0, value, 0xffffffff, &errorCode)
412 ) {
413 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
414 u_errorName(errorCode));
415 exit(errorCode);
416 }
417
418 /* add the multi-character case folding to the "unfold" data */
419 if(p->caseFolding!=NULL) {
420 int32_t length=p->caseFolding->full[0];
421 if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) {
422 addUnfolding(p->code, p->caseFolding->full+1, length);
423 }
424 }
425 }
426
427 extern void
428 addCaseSensitive(UChar32 first, UChar32 last) {
429 UErrorCode errorCode=U_ZERO_ERROR;
430 if(!upvec_setValue(pv, first, last+1, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode)) {
431 fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
432 u_errorName(errorCode));
433 exit(errorCode);
434 }
435 }
436
437 /* finalize reverse case folding ("unfold") data ---------------------------- */
438
439 static int32_t U_CALLCONV
440 compareUnfold(const void *context, const void *left, const void *right) {
441 return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH);
442 }
443
444 static void
445 makeUnfoldData() {
446 static const UChar
447 iDot[2]= { 0x69, 0x307 };
448
449 UChar *p, *q;
450 int32_t i, j, k;
451 UErrorCode errorCode;
452
453 /*
454 * add a case folding that we missed because it's conditional:
455 * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
456 */
457 addUnfolding(0x130, iDot, 2);
458
459 /* sort the data */
460 errorCode=U_ZERO_ERROR;
461 uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
462 compareUnfold, NULL, FALSE, &errorCode);
463
464 /* make unique-string rows by merging adjacent ones' code point columns */
465
466 /* make p point to row i-1 */
467 p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH;
468
469 for(i=1; i<unfoldRows;) {
470 if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
471 /* concatenate code point columns */
472 q=p+UGENCASE_UNFOLD_STRING_WIDTH;
473 for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
474 for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
475 q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
476 }
477 if(j>UGENCASE_UNFOLD_CP_WIDTH) {
478 fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
479 (long)j, UGENCASE_UNFOLD_CP_WIDTH);
480 exit(U_BUFFER_OVERFLOW_ERROR);
481 }
482
483 /* move following rows up one */
484 --unfoldRows;
485 unfoldTop-=UGENCASE_UNFOLD_WIDTH;
486 u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
487 } else {
488 p+=UGENCASE_UNFOLD_WIDTH;
489 ++i;
490 }
491 }
492
493 unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;
494
495 if(beVerbose) {
496 puts("unfold data:");
497
498 p=(UChar *)unfold;
499 for(i=0; i<unfoldRows; ++i) {
500 p+=UGENCASE_UNFOLD_WIDTH;
501 printf("[%2d] %04x %04x %04x <- %04x %04x\n",
502 (int)i, p[0], p[1], p[2], p[3], p[4]);
503 }
504 }
505 }
506
507 /* case closure ------------------------------------------------------------- */
508
509 static void
510 addClosureMapping(UChar32 src, UChar32 dest) {
511 uint32_t value;
512
513 if(beVerbose) {
514 printf("add closure mapping U+%04lx->U+%04lx\n",
515 (unsigned long)src, (unsigned long)dest);
516 }
517
518 value=upvec_getValue(pv, src, 0);
519 if(value&UCASE_EXCEPTION) {
520 Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
521 int32_t i;
522
523 /* append dest to src's closure array */
524 for(i=0;; ++i) {
525 if(i==LENGTHOF(p->closure)) {
526 fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n",
527 (unsigned long)src, (unsigned long)dest);
528 exit(U_BUFFER_OVERFLOW_ERROR);
529 } else if(p->closure[i]==dest) {
530 break; /* do not store duplicates */
531 } else if(p->closure[i]==0) {
532 p->closure[i]=dest;
533 break;
534 }
535 }
536 } else {
537 Props p2={ 0 };
538 UChar32 next;
539 UErrorCode errorCode;
540
541 /*
542 * decode value into p2 (enough for makeException() to work properly),
543 * add the closure mapping,
544 * and set the new exception for src
545 */
546 p2.code=src;
547 p2.closure[0]=dest;
548
549 if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
550 /* one simple case mapping, don't care which one */
551 next=src+((int16_t)value>>UCASE_DELTA_SHIFT);
552 if(next!=src) {
553 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
554 p2.upperCase=p2.titleCase=next;
555 } else {
556 p2.lowerCase=next;
557 }
558 }
559 } else if(value&UCASE_DELTA_MASK) {
560 fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n",
561 (unsigned long)src);
562 exit(U_INTERNAL_PROGRAM_ERROR);
563 }
564
565 value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */
566 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
567 value|=UCASE_EXCEPTION;
568 uprv_memcpy(excProps+exceptionsCount, &p2, sizeof(p2));
569 if(++exceptionsCount==MAX_EXC_COUNT) {
570 fprintf(stderr, "gencase: too many exceptions\n");
571 exit(U_INDEX_OUTOFBOUNDS_ERROR);
572 }
573
574 errorCode=U_ZERO_ERROR;
575 if(!upvec_setValue(pv, src, src+1, 0, value, 0xffffffff, &errorCode)) {
576 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
577 u_errorName(errorCode));
578 exit(errorCode);
579 }
580 }
581 }
582
583 /*
584 * Find missing case mapping relationships and add mappings for case closure.
585 * This function starts from an "original" code point and recursively
586 * finds its case mappings and the case mappings of where it maps to.
587 *
588 * The recursion depth is capped at 3 nested calls of this function.
589 * In each call, the current code point is c, and the function enumerates
590 * all of c's simple (single-code point) case mappings.
591 * prev is the code point that case-mapped to c.
592 * prev2 is the code point that case-mapped to prev.
593 *
594 * The initial function call has prev2<0, prev<0, and c==orig
595 * (marking no code points).
596 * It enumerates c's case mappings and recurses without further action.
597 *
598 * The second-level function call has prev2<0, prev==orig, and c is
599 * the destination code point of one of prev's case mappings.
600 * The function checks if any of c's case mappings go back to orig
601 * and adds a closure mapping if not.
602 * In other words, it turns a case mapping relationship of
603 * orig->c
604 * into
605 * orig<->c
606 *
607 * The third-level function call has prev2==orig, prev>=0, and c is
608 * the destination code point of one of prev's case mappings.
609 * (And prev is the destination of one of prev2's case mappings.)
610 * The function checks if any of c's case mappings go back to orig
611 * and adds a closure mapping if not.
612 * In other words, it turns case mapping relationships of
613 * orig->prev->c or orig->prev<->c
614 * into
615 * orig->prev->c->orig or orig->prev<->c->orig
616 * etc.
617 * (Graphically, this closes a triangle.)
618 *
619 * With repeated application on all code points until no more closure mappings
620 * are added, all case equivalence groups get complete mappings.
621 * That is, in each group of code points with case relationships
622 * each code point will in the end have some mapping to each other
623 * code point in the group.
624 *
625 * @return TRUE if a closure mapping was added
626 */
627 static UBool
628 addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) {
629 UChar32 next;
630 UBool someMappingsAdded=FALSE;
631
632 if(c!=orig) {
633 /* get the properties for c */
634 value=upvec_getValue(pv, c, 0);
635 }
636 /* else if c==orig then c's value was passed in */
637
638 if(value&UCASE_EXCEPTION) {
639 UChar32 set[32];
640 int32_t i, count=0;
641
642 Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
643
644 /*
645 * marker for whether any of c's mappings goes to orig
646 * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
647 */
648 UBool mapsToOrig=(UBool)(c==orig);
649
650 /* collect c's case mapping destinations in set[] */
651 if((next=p->upperCase)!=0 && next!=c) {
652 set[count++]=next;
653 }
654 if((next=p->lowerCase)!=0 && next!=c) {
655 set[count++]=next;
656 }
657 if(p->upperCase!=(next=p->titleCase) && next!=c) {
658 set[count++]=next;
659 }
660 if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) {
661 set[count++]=next;
662 }
663
664 /* append c's current closure mappings to set[] */
665 for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) {
666 set[count++]=next;
667 }
668
669 /* process all code points to which c case-maps */
670 for(i=0; i<count; ++i) {
671 next=set[i]; /* next!=c */
672
673 if(next==orig) {
674 mapsToOrig=TRUE; /* remember that we map to orig */
675 } else if(prev2<0 && next!=prev) {
676 /*
677 * recurse unless
678 * we have reached maximum depth (prev2>=0) or
679 * this is a mapping to one of the previous code points (orig, prev, c)
680 */
681 someMappingsAdded|=addClosure(orig, prev, c, next, 0);
682 }
683 }
684
685 if(!mapsToOrig) {
686 addClosureMapping(c, orig);
687 return TRUE;
688 }
689 } else {
690 if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
691 /* one simple case mapping, don't care which one */
692 next=c+((int16_t)value>>UCASE_DELTA_SHIFT);
693 if(next!=c) {
694 /*
695 * recurse unless
696 * we have reached maximum depth (prev2>=0) or
697 * this is a mapping to one of the previous code points (orig, prev, c)
698 */
699 if(prev2<0 && next!=orig && next!=prev) {
700 someMappingsAdded|=addClosure(orig, prev, c, next, 0);
701 }
702
703 if(c!=orig && next!=orig) {
704 /* c does not map to orig, add a closure mapping c->orig */
705 addClosureMapping(c, orig);
706 return TRUE;
707 }
708 }
709 }
710 }
711
712 return someMappingsAdded;
713 }
714
715 extern void
716 makeCaseClosure() {
717 UChar *p;
718 uint32_t *row;
719 uint32_t value;
720 UChar32 start, limit, c, c2;
721 int32_t i, j;
722 UBool someMappingsAdded;
723
724 /*
725 * finalize the "unfold" data because we need to use it to add closure mappings
726 * for situations like FB05->"st"<-FB06
727 * where we would otherwise miss the FB05<->FB06 relationship
728 */
729 makeUnfoldData();
730
731 /* use the "unfold" data to add mappings */
732
733 /* p always points to the code points; this loop ignores the strings completely */
734 p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH;
735
736 for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) {
737 j=0;
738 U16_NEXT_UNSAFE(p, j, c);
739 while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) {
740 U16_NEXT_UNSAFE(p, j, c2);
741 addClosure(c, U_SENTINEL, c, c2, 0);
742 }
743 }
744
745 if(beVerbose) {
746 puts("---- ---- ---- ---- (done with closures from unfolding)");
747 }
748
749 /* add further closure mappings from analyzing simple mappings */
750 do {
751 someMappingsAdded=FALSE;
752
753 i=0;
754 while((row=upvec_getRow(pv, i, &start, &limit))!=NULL) {
755 value=*row;
756 if(value!=0) {
757 while(start<limit) {
758 if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) {
759 someMappingsAdded=TRUE;
760
761 /*
762 * stop this loop because pv was changed and row is not valid any more
763 * skip all rows below the current start
764 */
765 while((row=upvec_getRow(pv, i, NULL, &limit))!=NULL && start>=limit) {
766 ++i;
767 }
768 row=NULL; /* signal to continue with outer loop, without further ++i */
769 break;
770 }
771 ++start;
772 }
773 if(row==NULL) {
774 continue; /* see row=NULL above */
775 }
776 }
777 ++i;
778 }
779
780 if(beVerbose && someMappingsAdded) {
781 puts("---- ---- ---- ----");
782 }
783 } while(someMappingsAdded);
784 }
785
786 /* exceptions --------------------------------------------------------------- */
787
788 /* get the string length from zero-terminated code points in a limited-length array */
789 static int32_t
790 getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) {
791 int32_t i, length;
792
793 for(i=length=0; i<maxLength && s[i]!=0; ++i) {
794 length+=U16_LENGTH(s[i]);
795 }
796 return length;
797 }
798
799 static UBool
800 fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
801 int32_t i, length;
802 UChar32 full;
803
804 length=*s++;
805 if(length==0 || length>U16_MAX_LENGTH) {
806 return FALSE;
807 }
808 i=0;
809 U16_NEXT(s, i, length, full);
810
811 if(simple==0) {
812 simple=c; /* UCD has no simple mapping if it's the same as the code point itself */
813 }
814 return (UBool)(i==length && full==simple);
815 }
816
817 static uint16_t
818 makeException(uint32_t value, Props *p) {
819 uint32_t slots[8];
820 uint32_t slotBits;
821 uint16_t excWord, excIndex, excTop, i, count, length, fullLengths;
822 UBool doubleSlots;
823
824 /* excIndex will be returned for storing in the trie word */
825 excIndex=exceptionsTop;
826 if(excIndex>=UCASE_MAX_EXCEPTIONS) {
827 fprintf(stderr, "gencase error: too many exceptions words\n");
828 exit(U_BUFFER_OVERFLOW_ERROR);
829 }
830
831 excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */
832
833 /* copy and shift the soft-dotted bits */
834 excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT;
835
836 /* update maxFullLength */
837 if(p->specialCasing!=NULL) {
838 length=p->specialCasing->lowerCase[0];
839 if(length>maxFullLength) {
840 maxFullLength=length;
841 }
842 length=p->specialCasing->upperCase[0];
843 if(length>maxFullLength) {
844 maxFullLength=length;
845 }
846 length=p->specialCasing->titleCase[0];
847 if(length>maxFullLength) {
848 maxFullLength=length;
849 }
850 }
851 if(p->caseFolding!=NULL) {
852 length=p->caseFolding->full[0];
853 if(length>maxFullLength) {
854 maxFullLength=length;
855 }
856 }
857
858 /* set the bits for conditional mappings */
859 if(p->specialCasing!=NULL && p->specialCasing->isComplex) {
860 excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
861 p->specialCasing=NULL;
862 }
863 if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
864 excWord|=UCASE_EXC_CONDITIONAL_FOLD;
865 p->caseFolding=NULL;
866 }
867
868 /*
869 * Note:
870 * UCD stores no simple mappings when they are the same as the code point itself.
871 * SpecialCasing and CaseFolding do store simple mappings even if they are
872 * the same as the code point itself.
873 * Comparisons between simple regular mappings and simple special/folding
874 * mappings need to compensate for the difference by comparing with the
875 * original code point if a simple UCD mapping is missing (0).
876 */
877
878 /* remove redundant data */
879 if(p->specialCasing!=NULL) {
880 /* do not store full mappings if they are the same as the simple ones */
881 if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) {
882 p->specialCasing->lowerCase[0]=0;
883 }
884 if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) {
885 p->specialCasing->upperCase[0]=0;
886 }
887 if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) {
888 p->specialCasing->titleCase[0]=0;
889 }
890 }
891 if( p->caseFolding!=NULL &&
892 fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code)
893 ) {
894 p->caseFolding->full[0]=0;
895 }
896
897 /* write the optional slots */
898 slotBits=0;
899 count=0;
900
901 if(p->lowerCase!=0) {
902 slots[count]=(uint32_t)p->lowerCase;
903 slotBits|=slots[count];
904 ++count;
905 excWord|=U_MASK(UCASE_EXC_LOWER);
906 }
907 if( p->caseFolding!=NULL &&
908 p->caseFolding->simple!=0 &&
909 (p->lowerCase!=0 ?
910 p->caseFolding->simple!=p->lowerCase :
911 p->caseFolding->simple!=p->code)
912 ) {
913 slots[count]=(uint32_t)p->caseFolding->simple;
914 slotBits|=slots[count];
915 ++count;
916 excWord|=U_MASK(UCASE_EXC_FOLD);
917 }
918 if(p->upperCase!=0) {
919 slots[count]=(uint32_t)p->upperCase;
920 slotBits|=slots[count];
921 ++count;
922 excWord|=U_MASK(UCASE_EXC_UPPER);
923 }
924 if(p->upperCase!=p->titleCase) {
925 if(p->titleCase!=0) {
926 slots[count]=(uint32_t)p->titleCase;
927 } else {
928 slots[count]=(uint32_t)p->code;
929 }
930 slotBits|=slots[count];
931 ++count;
932 excWord|=U_MASK(UCASE_EXC_TITLE);
933 }
934
935 /* length of case closure */
936 if(p->closure[0]!=0) {
937 length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure));
938 slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
939 slotBits|=slots[count];
940 ++count;
941 excWord|=U_MASK(UCASE_EXC_CLOSURE);
942 }
943
944 /* lengths of full case mapping strings, stored in the last slot */
945 fullLengths=0;
946 if(p->specialCasing!=NULL) {
947 fullLengths=p->specialCasing->lowerCase[0];
948 fullLengths|=p->specialCasing->upperCase[0]<<8;
949 fullLengths|=p->specialCasing->titleCase[0]<<12;
950 }
951 if(p->caseFolding!=NULL) {
952 fullLengths|=p->caseFolding->full[0]<<4;
953 }
954 if(fullLengths!=0) {
955 slots[count]=fullLengths;
956 slotBits|=slots[count];
957 ++count;
958 excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
959 }
960
961 /* write slots */
962 doubleSlots=(UBool)(slotBits>0xffff);
963 if(!doubleSlots) {
964 for(i=0; i<count; ++i) {
965 exceptions[excTop++]=(uint16_t)slots[i];
966 }
967 } else {
968 excWord|=UCASE_EXC_DOUBLE_SLOTS;
969 for(i=0; i<count; ++i) {
970 exceptions[excTop++]=(uint16_t)(slots[i]>>16);
971 exceptions[excTop++]=(uint16_t)slots[i];
972 }
973 }
974
975 /* write the full case mapping strings */
976 if(p->specialCasing!=NULL) {
977 length=(uint16_t)p->specialCasing->lowerCase[0];
978 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length);
979 excTop+=length;
980 }
981 if(p->caseFolding!=NULL) {
982 length=(uint16_t)p->caseFolding->full[0];
983 u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length);
984 excTop+=length;
985 }
986 if(p->specialCasing!=NULL) {
987 length=(uint16_t)p->specialCasing->upperCase[0];
988 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length);
989 excTop+=length;
990
991 length=(uint16_t)p->specialCasing->titleCase[0];
992 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length);
993 excTop+=length;
994 }
995
996 /* write the closure data */
997 if(p->closure[0]!=0) {
998 UChar32 c;
999
1000 for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) {
1001 U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c);
1002 }
1003 }
1004
1005 exceptionsTop=excTop;
1006
1007 /* write the main exceptions word */
1008 exceptions[excIndex]=excWord;
1009
1010 return excIndex;
1011 }
1012
1013 extern void
1014 makeExceptions() {
1015 uint32_t *row;
1016 uint32_t value;
1017 int32_t i;
1018 uint16_t excIndex;
1019
1020 i=0;
1021 while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) {
1022 value=*row;
1023 if(value&UCASE_EXCEPTION) {
1024 excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT));
1025 *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT);
1026 }
1027 ++i;
1028 }
1029 }
1030
1031 /* generate output data ----------------------------------------------------- */
1032
1033 extern void
1034 generateData(const char *dataDir, UBool csource) {
1035 static int32_t indexes[UCASE_IX_TOP]={
1036 UCASE_IX_TOP
1037 };
1038 static uint8_t trieBlock[40000];
1039
1040 const uint32_t *row;
1041 UChar32 start, limit;
1042 int32_t i;
1043
1044 UNewDataMemory *pData;
1045 UNewTrie *pTrie;
1046 UErrorCode errorCode=U_ZERO_ERROR;
1047 int32_t trieSize;
1048 long dataLength;
1049
1050 pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
1051 if(pTrie==NULL) {
1052 fprintf(stderr, "gencase error: unable to create a UNewTrie\n");
1053 exit(U_MEMORY_ALLOCATION_ERROR);
1054 }
1055
1056 for(i=0; (row=upvec_getRow(pv, i, &start, &limit))!=NULL; ++i) {
1057 if(!utrie_setRange32(pTrie, start, limit, *row, TRUE)) {
1058 fprintf(stderr, "gencase error: unable to set trie value (overflow)\n");
1059 exit(U_BUFFER_OVERFLOW_ERROR);
1060 }
1061 }
1062
1063 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
1064 if(U_FAILURE(errorCode)) {
1065 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
1066 exit(errorCode);
1067 }
1068
1069 indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
1070 indexes[UCASE_IX_TRIE_SIZE]=trieSize;
1071 indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop;
1072 indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop;
1073
1074 indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
1075
1076 if(beVerbose) {
1077 printf("trie size in bytes: %5d\n", (int)trieSize);
1078 printf("number of code points with exceptions: %5d\n", exceptionsCount);
1079 printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop);
1080 printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop);
1081 printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]);
1082 }
1083
1084 if(csource) {
1085 /* write .c file for hardcoded data */
1086 UTrie trie={ NULL };
1087 FILE *f;
1088
1089 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
1090 if(U_FAILURE(errorCode)) {
1091 fprintf(
1092 stderr,
1093 "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n",
1094 u_errorName(errorCode));
1095 return;
1096 }
1097
1098 f=usrc_create(dataDir, "ucase_props_data.c");
1099 if(f!=NULL) {
1100 usrc_writeArray(f,
1101 "static const UVersionInfo ucase_props_dataVersion={",
1102 dataInfo.dataVersion, 8, 4,
1103 "};\n\n");
1104 usrc_writeArray(f,
1105 "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={",
1106 indexes, 32, UCASE_IX_TOP,
1107 "};\n\n");
1108 usrc_writeUTrieArrays(f,
1109 "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL,
1110 &trie,
1111 "\n};\n\n");
1112 usrc_writeArray(f,
1113 "static const uint16_t ucase_props_exceptions[%ld]={\n",
1114 exceptions, 16, exceptionsTop,
1115 "\n};\n\n");
1116 usrc_writeArray(f,
1117 "static const uint16_t ucase_props_unfold[%ld]={\n",
1118 unfold, 16, unfoldTop,
1119 "\n};\n\n");
1120 fputs(
1121 "static const UCaseProps ucase_props_singleton={\n"
1122 " NULL,\n"
1123 " ucase_props_indexes,\n"
1124 " ucase_props_exceptions,\n"
1125 " ucase_props_unfold,\n",
1126 f);
1127 usrc_writeUTrieStruct(f,
1128 " {\n",
1129 &trie, "ucase_props_trieIndex", NULL, NULL,
1130 " },\n");
1131 usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n");
1132 fputs("};\n", f);
1133 fclose(f);
1134 }
1135 } else {
1136 /* write the data */
1137 pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo,
1138 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1139 if(U_FAILURE(errorCode)) {
1140 fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode));
1141 exit(errorCode);
1142 }
1143
1144 udata_writeBlock(pData, indexes, sizeof(indexes));
1145 udata_writeBlock(pData, trieBlock, trieSize);
1146 udata_writeBlock(pData, exceptions, 2*exceptionsTop);
1147 udata_writeBlock(pData, unfold, 2*unfoldTop);
1148
1149 /* finish up */
1150 dataLength=udata_finish(pData, &errorCode);
1151 if(U_FAILURE(errorCode)) {
1152 fprintf(stderr, "gencase: error %d writing the output file\n", errorCode);
1153 exit(errorCode);
1154 }
1155
1156 if(dataLength!=indexes[UCASE_IX_LENGTH]) {
1157 fprintf(stderr, "gencase: data length %ld != calculated size %d\n",
1158 dataLength, (int)indexes[UCASE_IX_LENGTH]);
1159 exit(U_INTERNAL_PROGRAM_ERROR);
1160 }
1161 }
1162
1163 utrie_close(pTrie);
1164 }
1165
1166 /*
1167 * Hey, Emacs, please set the following:
1168 *
1169 * Local Variables:
1170 * indent-tabs-mode: nil
1171 * End:
1172 *
1173 */