]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gencase/store.c
ICU-6.2.14.tar.gz
[apple/icu.git] / icuSources / tools / gencase / store.c
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 2004, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: store.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2004aug28
14* created by: Markus W. Scherer
15*
16* Store Unicode case mapping properties efficiently for
17* random access.
18*/
19
20#include <stdio.h>
21#include <stdlib.h>
22#include "unicode/utypes.h"
23#include "unicode/uchar.h"
24#include "unicode/ustring.h"
25#include "cmemory.h"
26#include "cstring.h"
27#include "filestrm.h"
28#include "utrie.h"
29#include "unicode/udata.h"
30#include "unewdata.h"
31#include "propsvec.h"
32#include "gencase.h"
33
34/* Unicode case mapping properties file format ---------------------------------
35
36The file format prepared and written here contains several data
37structures that store indexes or data.
38
39Before the data contents described below, there are the headers required by
40the udata API for loading ICU data. Especially, a UDataInfo structure
41precedes the actual data. It contains platform properties values and the
42file format version.
43
44The following is a description of format version 1.
45
46The file contains the following structures:
47
48 const int32_t indexes[i0] with values i0, i1, ...:
49 (see UCASE_IX_... constants for names of indexes)
50
51 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
52 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
53 i2 trieSize; -- size in bytes of the case mapping properties trie
54 i3 exceptionsLength; -- length in uint16_t of the exceptions array
55
56 i4..i14 reservedIndexes; -- reserved values; 0 for now
57
58 i15 maxFullLength; -- maximum length of a full case mapping/folding string
59
60
61 Serialized trie, see utrie.h;
62
63 const uint16_t exceptions[exceptionsLength];
64
65
66Trie data word:
67Bits
68if(exception) {
69 15..4 unsigned exception index
70} else {
71 if(not uncased) {
72 15..6 signed delta to simple case mapping code point
73 (add delta to input code point)
74 } else {
75 6 the code point is case-ignorable
76 (U+0307 is also case-ignorable but has an exception)
77 }
78 5..4 0 normal character with cc=0
79 1 soft-dotted character
80 2 cc=230
81 3 other cc
82}
83 3 exception
84 2 case sensitive
85 1..0 0 uncased
86 1 lowercase
87 2 uppercase
88 3 titlecase
89
90
91Exceptions:
92A sub-array of the exceptions array is indexed by the exception index in a
93trie word.
94The sub-array consists of the following fields:
95 uint16_t excWord;
96 uint16_t optional values [];
97 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
98
99excWord: (see UCASE_EXC_...)
100Bits
101 15 conditional case folding
102 14 conditional special casing
10313..12 same as non-exception trie data bits 5..4
104 moved here because the exception index needs more bits than the delta
105 0 normal character with cc=0
106 1 soft-dotted character
107 2 cc=230
108 3 other cc
10911.. 9 reserved
110 8 if set, then for each optional-value slot there are 2 uint16_t values
111 (high and low parts of 32-bit values)
112 instead of single ones
113 7.. 0 bits for which optional value is present
114
115Optional-value slots:
1160 lowercase mapping (code point)
1171 case folding (code point)
1182 uppercase mapping (code point)
1193 titlecase mapping (code point)
1204..6 reserved
1217 there is at least one full (string) case mapping
122 the length of each is encoded in a nibble of this optional value,
123 and the strings follow this optional value in the same order:
124 lower/fold/upper/title
125
126For space saving, some values are not stored. Lookups are as follows:
127- If special casing is conditional, then no full lower/upper/title mapping
128 strings are stored.
129- If case folding is conditional, then no simple or full case foldings are
130 stored.
131- Fall back in this order:
132 full (string) mapping -- if full mappings are used
133 simple (code point) mapping of the same type
134 simple fold->simple lower
135 simple title->simple upper
136 finally, the original code point (no mapping)
137
138----------------------------------------------------------------------------- */
139
/*
 * UDataInfo cf. udata.h
 *
 * Header information that generateData() passes to udata_create() for the
 * output file. The dataVersion member is overwritten by setUnicodeVersion().
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    /* dataFormat="cAsE" */
    { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
    { 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
    { 4, 0, 1, 0 }                              /* dataVersion */
};
155
enum {
    /*
     * maximum number of exceptions expected;
     * this is the capacity of excProps[], and setProps() exits when
     * exceptionsCount reaches it
     */
    MAX_EXC_COUNT=1000
};

/* exceptions values */

/*
 * Encoded exceptions words, filled by makeException() and written to the
 * output file by generateData().
 * makeException() only checks the start index against UCASE_MAX_EXCEPTIONS,
 * so the extra 100 words absorb the words written for the last exception.
 */
static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100];
/* number of uint16_t words used in exceptions[] */
static uint16_t exceptionsTop=0;
/* copies of the Props of characters that need an exception, stored by setProps() */
static Props excProps[MAX_EXC_COUNT];
/* number of entries used in excProps[] */
static uint16_t exceptionsCount=0;

/*
 * becomes indexes[UCASE_IX_MAX_FULL_LENGTH];
 * starts at U16_MAX_LENGTH and is raised by makeException() when it sees a
 * longer full mapping string
 */
static int32_t maxFullLength=U16_MAX_LENGTH;
169
170/* -------------------------------------------------------------------------- */
171
172extern void
173setUnicodeVersion(const char *v) {
174 UVersionInfo version;
175 u_versionFromString(version, v);
176 uprv_memcpy(dataInfo.dataVersion, version, 4);
177}
178
179/* store a character's properties ------------------------------------------- */
180
181extern void
182setProps(Props *p) {
183 UErrorCode errorCode;
184 uint32_t value, oldValue;
185 int32_t delta;
186
187 /* get the non-UnicodeData.txt properties */
188 value=oldValue=upvec_getValue(pv, p->code, 0);
189
190 /* default: map to self */
191 delta=0;
192
193 if(p->gc==U_TITLECASE_LETTER) {
194 /* the Titlecase property is read late, from UnicodeData.txt */
195 value|=UCASE_TITLE;
196 }
197
198 if(p->upperCase!=0) {
199 /* uppercase mapping as delta if the character is lowercase */
200 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
201 delta=p->upperCase-p->code;
202 } else {
203 value|=UCASE_EXCEPTION;
204 }
205 }
206 if(p->lowerCase!=0) {
207 /* lowercase mapping as delta if the character is uppercase or titlecase */
208 if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
209 delta=p->lowerCase-p->code;
210 } else {
211 value|=UCASE_EXCEPTION;
212 }
213 }
214 if(p->upperCase!=p->titleCase) {
215 value|=UCASE_EXCEPTION;
216 }
217 if(p->specialCasing!=NULL) {
218 value|=UCASE_EXCEPTION;
219 }
220 if(p->caseFolding!=NULL) {
221 value|=UCASE_EXCEPTION;
222 }
223
224 if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
225 value|=UCASE_EXCEPTION;
226 }
227
228 if(p->cc!=0) {
229 if(value&UCASE_DOT_MASK) {
230 fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
231 exit(U_INTERNAL_PROGRAM_ERROR);
232 }
233 if(p->cc==230) {
234 value|=UCASE_ABOVE;
235 } else {
236 value|=UCASE_OTHER_ACCENT;
237 }
238 }
239
240 /* encode case-ignorable as delta==1 on uncased characters */
241 if(
242 (value&UCASE_TYPE_MASK)==UCASE_NONE &&
243 p->code!=0x307 &&
244 ((U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
245 p->code==0x27 || p->code==0xad || p->code==0x2019)
246 ) {
247 /*
248 * We use one of the delta/exception bits, which works because we only
249 * store the case-ignorable flag for uncased characters.
250 * There is no delta for uncased characters (see checks above).
251 * If there is an exception for an uncased, case-ignorable character
252 * (although there should not be any case mappings if it's uncased)
253 * then we have a problem.
254 * There is one character which is case-ignorable but has an exception:
255 * U+0307 is uncased, Mn, has conditional special casing and
256 * is therefore handled in code instead.
257 */
258 if(value&UCASE_EXCEPTION) {
259 fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
260 (unsigned long)p->code);
261 exit(U_INTERNAL_PROGRAM_ERROR);
262 }
263
264 delta=1;
265 }
266
267 /* handle exceptions */
268 if(value&UCASE_EXCEPTION) {
269 /* simply store exceptions for later processing and encoding */
270 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
271 uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
272 if(++exceptionsCount==MAX_EXC_COUNT) {
273 fprintf(stderr, "gencase: too many exceptions\n");
274 exit(U_INDEX_OUTOFBOUNDS_ERROR);
275 }
276 } else {
277 /* store the simple case mapping delta */
278 value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
279 }
280
281 errorCode=U_ZERO_ERROR;
282 if( value!=oldValue &&
283 !upvec_setValue(pv, p->code, p->code+1, 0, value, 0xffffffff, &errorCode)
284 ) {
285 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
286 u_errorName(errorCode));
287 exit(errorCode);
288 }
289}
290
291extern void
292addCaseSensitive(UChar32 first, UChar32 last) {
293 UErrorCode errorCode=U_ZERO_ERROR;
294 if(!upvec_setValue(pv, first, last+1, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode)) {
295 fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
296 u_errorName(errorCode));
297 exit(errorCode);
298 }
299}
300
/*
 * Placeholder: not implemented yet, the body is intentionally empty.
 * NOTE(review): judging by the name this is meant to build case closure
 * data -- confirm the intended behavior before implementing.
 */
extern void
makeCaseClosure() {
    /* TODO */
}
305
306/* exceptions --------------------------------------------------------------- */
307
308static UBool
309fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
310 int32_t i, length;
311 UChar32 full;
312
313 length=*s++;
314 if(length==0 || length>U16_MAX_LENGTH) {
315 return FALSE;
316 }
317 i=0;
318 U16_NEXT(s, i, length, full);
319
320 if(simple==0) {
321 simple=c; /* UCD has no simple mapping if it's the same as the code point itself */
322 }
323 return (UBool)(i==length && full==simple);
324}
325
326static uint16_t
327makeException(uint32_t value, Props *p) {
328 uint32_t slots[8];
329 uint32_t slotBits;
330 uint16_t excWord, excIndex, excTop, i, count, length, fullLengths;
331 UBool doubleSlots;
332
333 /* excIndex will be returned for storing in the trie word */
334 excIndex=exceptionsTop;
335 if(excIndex>=UCASE_MAX_EXCEPTIONS) {
336 fprintf(stderr, "gencase error: too many exceptions words\n");
337 exit(U_BUFFER_OVERFLOW_ERROR);
338 }
339
340 excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */
341
342 /* copy and shift the soft-dotted bits */
343 excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT;
344
345 /* update maxFullLength */
346 if(p->specialCasing!=NULL) {
347 length=p->specialCasing->lowerCase[0];
348 if(length>maxFullLength) {
349 maxFullLength=length;
350 }
351 length=p->specialCasing->upperCase[0];
352 if(length>maxFullLength) {
353 maxFullLength=length;
354 }
355 length=p->specialCasing->titleCase[0];
356 if(length>maxFullLength) {
357 maxFullLength=length;
358 }
359 }
360 if(p->caseFolding!=NULL) {
361 length=p->caseFolding->full[0];
362 if(length>maxFullLength) {
363 maxFullLength=length;
364 }
365 }
366
367 /* set the bits for conditional mappings */
368 if(p->specialCasing!=NULL && p->specialCasing->isComplex) {
369 excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
370 p->specialCasing=NULL;
371 }
372 if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
373 excWord|=UCASE_EXC_CONDITIONAL_FOLD;
374 p->caseFolding=NULL;
375 }
376
377 /*
378 * Note:
379 * UCD stores no simple mappings when they are the same as the code point itself.
380 * SpecialCasing and CaseFolding do store simple mappings even if they are
381 * the same as the code point itself.
382 * Comparisons between simple regular mappings and simple special/folding
383 * mappings need to compensate for the difference by comparing with the
384 * original code point if a simple UCD mapping is missing (0).
385 */
386
387 /* remove redundant data */
388 if(p->specialCasing!=NULL) {
389 /* do not store full mappings if they are the same as the simple ones */
390 if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) {
391 p->specialCasing->lowerCase[0]=0;
392 }
393 if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) {
394 p->specialCasing->upperCase[0]=0;
395 }
396 if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) {
397 p->specialCasing->titleCase[0]=0;
398 }
399 }
400 if( p->caseFolding!=NULL &&
401 fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code)
402 ) {
403 p->caseFolding->full[0]=0;
404 }
405
406 /* write the optional slots */
407 slotBits=0;
408 count=0;
409
410 if(p->lowerCase!=0) {
411 slots[count]=(uint32_t)p->lowerCase;
412 slotBits|=slots[count];
413 ++count;
414 excWord|=U_MASK(UCASE_EXC_LOWER);
415 }
416 if( p->caseFolding!=NULL &&
417 p->caseFolding->simple!=0 &&
418 (p->lowerCase!=0 ?
419 p->caseFolding->simple!=p->lowerCase :
420 p->caseFolding->simple!=p->code)
421 ) {
422 slots[count]=(uint32_t)p->caseFolding->simple;
423 slotBits|=slots[count];
424 ++count;
425 excWord|=U_MASK(UCASE_EXC_FOLD);
426 }
427 if(p->upperCase!=0) {
428 slots[count]=(uint32_t)p->upperCase;
429 slotBits|=slots[count];
430 ++count;
431 excWord|=U_MASK(UCASE_EXC_UPPER);
432 }
433 if(p->upperCase!=p->titleCase) {
434 if(p->titleCase!=0) {
435 slots[count]=(uint32_t)p->titleCase;
436 } else {
437 slots[count]=(uint32_t)p->code;
438 }
439 slotBits|=slots[count];
440 ++count;
441 excWord|=U_MASK(UCASE_EXC_TITLE);
442 }
443
444 /* lengths of full case mapping strings, stored in the last slot */
445 fullLengths=0;
446 if(p->specialCasing!=NULL) {
447 fullLengths=p->specialCasing->lowerCase[0];
448 fullLengths|=p->specialCasing->upperCase[0]<<8;
449 fullLengths|=p->specialCasing->titleCase[0]<<12;
450 }
451 if(p->caseFolding!=NULL) {
452 fullLengths|=p->caseFolding->full[0]<<4;
453 }
454 if(fullLengths!=0) {
455 slots[count]=fullLengths;
456 slotBits|=slots[count];
457 ++count;
458 excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
459 }
460
461 /* write slots */
462 doubleSlots=(UBool)(slotBits>0xffff);
463 if(!doubleSlots) {
464 for(i=0; i<count; ++i) {
465 exceptions[excTop++]=(uint16_t)slots[i];
466 }
467 } else {
468 excWord|=UCASE_EXC_DOUBLE_SLOTS;
469 for(i=0; i<count; ++i) {
470 exceptions[excTop++]=(uint16_t)(slots[i]>>16);
471 exceptions[excTop++]=(uint16_t)slots[i];
472 }
473 }
474
475 /* write the full case mapping strings */
476 if(p->specialCasing!=NULL) {
477 length=(uint16_t)p->specialCasing->lowerCase[0];
478 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length);
479 excTop+=length;
480 }
481 if(p->caseFolding!=NULL) {
482 length=(uint16_t)p->caseFolding->full[0];
483 u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length);
484 excTop+=length;
485 }
486 if(p->specialCasing!=NULL) {
487 length=(uint16_t)p->specialCasing->upperCase[0];
488 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length);
489 excTop+=length;
490
491 length=(uint16_t)p->specialCasing->titleCase[0];
492 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length);
493 excTop+=length;
494 }
495
496 exceptionsTop=excTop;
497
498 /* write the main exceptions word */
499 exceptions[excIndex]=excWord;
500
501 return excIndex;
502}
503
504extern void
505makeExceptions() {
506 uint32_t *row;
507 uint32_t value;
508 int32_t i;
509 uint16_t excIndex;
510
511 i=0;
512 while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) {
513 value=*row;
514 if(value&UCASE_EXCEPTION) {
515 excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT));
516 *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT);
517 }
518 ++i;
519 }
520}
521
522/* generate output data ----------------------------------------------------- */
523
524extern void
525generateData(const char *dataDir) {
526 static int32_t indexes[UCASE_IX_TOP]={
527 UCASE_IX_TOP
528 };
529 static uint8_t trieBlock[40000];
530
531 const uint32_t *row;
532 UChar32 start, limit;
533 int32_t i;
534
535 UNewDataMemory *pData;
536 UNewTrie *pTrie;
537 UErrorCode errorCode=U_ZERO_ERROR;
538 int32_t trieSize;
539 long dataLength;
540
541 pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
542 if(pTrie==NULL) {
543 fprintf(stderr, "gencase error: unable to create a UNewTrie\n");
544 exit(U_MEMORY_ALLOCATION_ERROR);
545 }
546
547 for(i=0; (row=upvec_getRow(pv, i, &start, &limit))!=NULL; ++i) {
548 if(!utrie_setRange32(pTrie, start, limit, *row, TRUE)) {
549 fprintf(stderr, "gencase error: unable to set trie value (overflow)\n");
550 exit(U_BUFFER_OVERFLOW_ERROR);
551 }
552 }
553
554 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
555 if(U_FAILURE(errorCode)) {
556 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
557 exit(errorCode);
558 }
559
560 indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
561 indexes[UCASE_IX_TRIE_SIZE]=trieSize;
562 indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop;
563
564 indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
565
566 if(beVerbose) {
567 printf("trie size in bytes: %5d\n", (int)trieSize);
568 printf("number of code points with exceptions: %5d\n", exceptionsCount);
569 printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop);
570 printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]);
571 }
572
573 /* write the data */
574 pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo,
575 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
576 if(U_FAILURE(errorCode)) {
577 fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode));
578 exit(errorCode);
579 }
580
581 udata_writeBlock(pData, indexes, sizeof(indexes));
582 udata_writeBlock(pData, trieBlock, trieSize);
583 udata_writeBlock(pData, exceptions, 2*exceptionsTop);
584
585 /* finish up */
586 dataLength=udata_finish(pData, &errorCode);
587 if(U_FAILURE(errorCode)) {
588 fprintf(stderr, "gencase: error %d writing the output file\n", errorCode);
589 exit(errorCode);
590 }
591
592 if(dataLength!=indexes[UCASE_IX_LENGTH]) {
593 fprintf(stderr, "gencase: data length %ld != calculated size %d\n",
594 dataLength, (int)indexes[UCASE_IX_LENGTH]);
595 exit(U_INTERNAL_PROGRAM_ERROR);
596 }
597
598 utrie_close(pTrie);
599}
600
601/*
602 * Hey, Emacs, please set the following:
603 *
604 * Local Variables:
605 * indent-tabs-mode: nil
606 * End:
607 *
608 */