]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genprops/store.c
ICU-400.40.tar.gz
[apple/icu.git] / icuSources / tools / genprops / store.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
46f4442e 4* Copyright (C) 1999-2008, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: store.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 1999dec11
14* created by: Markus W. Scherer
15*
16* Store Unicode character properties efficiently for
17* random access.
18*/
19
20#include <stdio.h>
b75a7d8f
A
21#include "unicode/utypes.h"
22#include "unicode/uchar.h"
23#include "cmemory.h"
24#include "cstring.h"
b75a7d8f
A
25#include "utrie.h"
26#include "unicode/udata.h"
27#include "unewdata.h"
73c04bcf 28#include "writesrc.h"
b75a7d8f
A
29#include "uprops.h"
30#include "genprops.h"
31
32#define DO_DEBUG_OUT 0
33
34/* Unicode character properties file format ------------------------------------
35
36The file format prepared and written here contains several data
37structures that store indexes or data.
38
39Before the data contents described below, there are the headers required by
40the udata API for loading ICU data. Especially, a UDataInfo structure
41precedes the actual data. It contains platform properties values and the
42file format version.
43
46f4442e 44The following is a description of format version 5 .
73c04bcf
A
45
46The format changes between version 3 and 4 because the properties related to
47case mappings and bidi/shaping are pulled out into separate files
48for modularization.
49In order to reduce the need for code changes, some of the previous data
50structures are omitted, rather than rearranging everything.
51
52For details see "Changes in format version 4" below.
b75a7d8f 53
46f4442e
A
54Format version 5 became necessary because the bit field for script codes
55overflowed. Several bit fields got rearranged, and three (Script, Block,
56Word_Break) got widened by one bit each.
57
b75a7d8f
A
58Data contents:
59
60The contents are a parsed, binary form of several Unicode character
61database files, most prominently UnicodeData.txt.
62
63Any Unicode code point from 0 to 0x10ffff can be looked up to get
64the properties, if any, for that code point. This means that the inputs
65to the lookup are 21-bit unsigned integers, with not all of the
6621-bit range used.
67
68It is assumed that client code keeps a uint32_t pointer
69to the beginning of the data:
70
71 const uint32_t *p32;
72
73Formally, the file contains the following structures:
74
75 const int32_t indexes[16] with values i0..i15:
76
73c04bcf
A
77 i0 indicates the length of the main trie.
78 i0..i3 all have the same value in format version 4.0;
79 the related props32[] and exceptions[] and uchars[] were used in format version 3
80
b75a7d8f
A
81 i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
82 i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
83 i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
84
85 i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
86 i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
87 i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
88
89 i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
90 i7..i9 reservedIndexes; -- reserved values; 0 for now
91
73c04bcf
A
92 i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
93 i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
b75a7d8f
A
94 i12..i15 reservedIndexes; -- reserved values; 0 for now
95
96 PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
97
73c04bcf
A
98 P, E, and U are not used (empty) in format version 4
99
b75a7d8f
A
100 P const uint32_t props32[i1-i0];
101 E const uint32_t exceptions[i2-i1];
102 U const UChar uchars[2*(i3-i2)];
103
104 AT serialized trie for additional properties (byte size: 4*(i4-i3))
105 PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
106
107Trie lookup and properties:
108
109In order to condense the data for the 21-bit code space, several properties of
110the Unicode code assignment are exploited:
111- The code space is sparse.
112- There are several runs of tens of thousands of consecutive codes with the same properties.
113- Characters and scripts are allocated in groups of 16 code points.
114- Inside blocks for scripts the properties are often repetitive.
115- The 21-bit space is not fully used for Unicode.
116
117The lookup of properties for a given code point is done with a trie lookup,
118using the UTrie implementation.
73c04bcf 119The trie lookup result is a 16-bit properties word.
b75a7d8f
A
120
121With a given Unicode code point
122
123 UChar32 c;
124
125and 0<=c<0x110000, the lookup is done like this:
126
73c04bcf
A
127 uint16_t props;
128 UTRIE_GET16(trie, c, props);
b75a7d8f 129
73c04bcf 130Each 16-bit properties word contains:
b75a7d8f
A
131
132 0.. 4 general category
73c04bcf
A
133 5.. 7 numeric type
134 non-digit numbers are stored with multiple types and pseudo-types
135 in order to facilitate compact encoding:
136 0 no numeric value (0)
137 1 decimal digit value (0..9)
138 2 digit value (0..9)
139 3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff
140 4 (internal type UPROPS_NT_FRACTION) fraction
141 5 (internal type UPROPS_NT_LARGE) large number >0xff
142 6..7 reserved
143
144 when returning the numeric type from a public API,
145 internal types must be turned into U_NT_NUMERIC
146
147 8..15 numeric value
148 encoding of fractions and large numbers see below
149
150Fractions:
151 // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
152 int32_t num, den;
153 num=n>>3; // num=0..31
154 den=(n&7)+2; // den=2..9
155 if(num==0) {
156 num=-1; // num=-1 or 1..31
157 }
158 double result=(double)num/(double)den;
159
160Large numbers:
161 // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
162 int32_t m, e;
163 m=n>>4; // m=0..15
164 e=(n&0xf);
165 if(m==0) {
166 m=1; // for large powers of 10
167 e+=18; // e=18..33
168 } else {
169 e+=2; // e=2..17
170 } // m==10..15 are reserved
171 double result=(double)m*pow(10., e); // m times 10 to the power of e
b75a7d8f
A
172
173--- Additional properties (new in format version 2.1) ---
174
175The second trie for additional properties (AT) is also a UTrie with 16-bit data.
176The data words consist of 32-bit unit indexes (not row indexes!) into the
177table of unique properties vectors (PV).
178Each vector contains a set of properties.
179The width of a vector (number of uint32_t per row) may change
180with the formatVersion, it is stored in i5.
181
182Current properties: see icu/source/common/uprops.h
183
184--- Changes in format version 3.1 ---
185
186See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
187
188--- Changes in format version 3.2 ---
189
190- The tries use linear Latin-1 ranges.
191- The additional properties bits store full properties XYZ instead
192 of partial Other_XYZ, so that changes in the derivation formulas
193 need not be tracked in runtime library code.
194- Joining Type and Line Break are also stored completely, so that uprops.c
195 needs no runtime formulas for enumerated properties either.
196- Store the case-sensitive flag in the main properties word.
197- i10 also contains U_LB_COUNT and U_EA_COUNT.
198- i11 contains maxValues2 for vector word 2.
199
73c04bcf
A
200--- Changes in format version 4 ---
201
202The format changes between version 3 and 4 because the properties related to
203case mappings and bidi/shaping are pulled out into separate files
204for modularization.
205In order to reduce the need for code changes, some of the previous data
206structures are omitted, rather than rearranging everything.
207
208(The change to format version 4 is for ICU 3.4. The last CVS revision of
209genprops/store.c for format version 3.2 is 1.48.)
210
211The main trie's data is significantly simplified:
212- The trie's 16-bit data word is used directly instead of as an index
213 into props32[].
214- The trie uses the default trie folding functions instead of custom ones.
215- Numeric values are stored directly in the trie data word, with special
216 encodings.
217- No more exception data (the data that needed it was pulled out, or, in the
218 case of numeric values, encoded differently).
219- No more string data (pulled out - was for case mappings).
220
221Also, some of the previously used properties vector bits are reserved again.
222
223The indexes[] values for the omitted structures are still filled in
224(indicating zero-length arrays) so that the swapper code remains unchanged.
225
46f4442e
A
226--- Changes in format version 5 ---
227
228Rearranged bit fields in the second trie (AT) because the script code field
229overflowed. Old code would have seen nonsensically low values for new, higher
230script codes.
231Modified bit fields in icu/source/common/uprops.h
232
b75a7d8f
A
233----------------------------------------------------------------------------- */
234
235/* UDataInfo cf. udata.h */
236static UDataInfo dataInfo={
237 sizeof(UDataInfo),
238 0,
239
240 U_IS_BIG_ENDIAN,
241 U_CHARSET_FAMILY,
242 U_SIZEOF_UCHAR,
243 0,
244
245 { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
46f4442e
A
246 { 5, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
247 { 5, 1, 0, 0 } /* dataVersion */
b75a7d8f
A
248};
249
b75a7d8f
A
250static UNewTrie *pTrie=NULL;
251
b75a7d8f
A
252/* -------------------------------------------------------------------------- */
253
254extern void
255setUnicodeVersion(const char *v) {
256 UVersionInfo version;
257 u_versionFromString(version, v);
258 uprv_memcpy(dataInfo.dataVersion, version, 4);
259}
260
261extern void
262initStore() {
73c04bcf 263 pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
b75a7d8f
A
264 if(pTrie==NULL) {
265 fprintf(stderr, "error: unable to create a UNewTrie\n");
266 exit(U_MEMORY_ALLOCATION_ERROR);
267 }
268
b75a7d8f
A
269 initAdditionalProperties();
270}
271
73c04bcf
A
272extern void
273exitStore() {
274 utrie_close(pTrie);
275 exitAdditionalProperties();
276}
277
278static uint32_t printNumericTypeValueError(Props *p) {
279 fprintf(stderr, "genprops error: unable to encode numeric type & value %d %ld/%lu E%d\n",
280 (int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent);
281 exit(U_ILLEGAL_ARGUMENT_ERROR);
282 return 0;
283}
284
b75a7d8f
A
285/* store a character's properties ------------------------------------------- */
286
287extern uint32_t
288makeProps(Props *p) {
73c04bcf
A
289 uint32_t den;
290 int32_t type, value, exp;
291
292 /* encode numeric type & value */
293 type=p->numericType;
294 value=p->numericValue;
295 den=p->denominator;
296 exp=p->exponent;
297
298 if(den!=0) {
299 /* fraction */
300 if( type!=U_NT_NUMERIC ||
301 value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM ||
302 den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den ||
303 exp!=0
304 ) {
305 return printNumericTypeValueError(p);
b75a7d8f 306 }
73c04bcf
A
307 type=UPROPS_NT_FRACTION;
308
309 if(value==-1) {
310 value=0;
b75a7d8f 311 }
73c04bcf
A
312 den-=UPROPS_FRACTION_DEN_OFFSET;
313 value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den;
314 } else if(exp!=0) {
315 /* very large value */
316 if( type!=U_NT_NUMERIC ||
317 value<1 || 9<value ||
318 exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp
319 ) {
320 return printNumericTypeValueError(p);
b75a7d8f 321 }
73c04bcf 322 type=UPROPS_NT_LARGE;
b75a7d8f 323
73c04bcf
A
324 if(exp<=UPROPS_LARGE_MAX_EXP) {
325 /* 1..9 * 10^(2..17) */
326 exp-=UPROPS_LARGE_EXP_OFFSET;
327 } else {
328 /* 1 * 10^(18..33) */
329 if(value!=1) {
330 return printNumericTypeValueError(p);
b75a7d8f 331 }
73c04bcf
A
332 value=0;
333 exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA;
b75a7d8f 334 }
73c04bcf
A
335 value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
336 } else if(value>UPROPS_MAX_SMALL_NUMBER) {
337 /* large value */
338 if(type!=U_NT_NUMERIC) {
339 return printNumericTypeValueError(p);
340 }
341 type=UPROPS_NT_LARGE;
b75a7d8f 342
73c04bcf
A
343 /* split the value into mantissa and exponent, base 10 */
344 while((value%10)==0) {
345 value/=10;
346 ++exp;
347 }
348 if(value>9) {
349 return printNumericTypeValueError(p);
350 }
b75a7d8f 351
73c04bcf
A
352 exp-=UPROPS_LARGE_EXP_OFFSET;
353 value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
354 } else if(value<0) {
355 /* unable to encode negative values, other than fractions -1/x */
356 return printNumericTypeValueError(p);
b75a7d8f 357
73c04bcf 358 /* } else normal value=0..0xff { */
b75a7d8f
A
359 }
360
73c04bcf
A
361 /* encode the properties */
362 return
b75a7d8f 363 (uint32_t)p->generalCategory |
73c04bcf
A
364 ((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) |
365 ((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT);
b75a7d8f
A
366}
367
368extern void
369addProps(uint32_t c, uint32_t x) {
370 if(!utrie_set32(pTrie, (UChar32)c, x)) {
371 fprintf(stderr, "error: too many entries for the properties trie\n");
372 exit(U_BUFFER_OVERFLOW_ERROR);
373 }
374}
375
b75a7d8f
A
376extern uint32_t
377getProps(uint32_t c) {
378 return utrie_get32(pTrie, (UChar32)c, NULL);
379}
380
381/* areas of same properties ------------------------------------------------- */
382
383extern void
384repeatProps(uint32_t first, uint32_t last, uint32_t x) {
385 if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
386 fprintf(stderr, "error: too many entries for the properties trie\n");
387 exit(U_BUFFER_OVERFLOW_ERROR);
388 }
389}
390
b75a7d8f
A
391/* generate output data ----------------------------------------------------- */
392
b75a7d8f 393extern void
73c04bcf 394generateData(const char *dataDir, UBool csource) {
b75a7d8f
A
395 static int32_t indexes[UPROPS_INDEX_COUNT]={
396 0, 0, 0, 0,
397 0, 0, 0, 0,
398 0, 0, 0, 0,
399 0, 0, 0, 0
400 };
401 static uint8_t trieBlock[40000];
402 static uint8_t additionalProps[120000];
403
404 UNewDataMemory *pData;
405 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf 406 uint32_t size = 0;
b75a7d8f
A
407 int32_t trieSize, additionalPropsSize, offset;
408 long dataLength;
409
73c04bcf 410 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
b75a7d8f
A
411 if(U_FAILURE(errorCode)) {
412 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
413 exit(errorCode);
414 }
415
416 offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
417
73c04bcf 418 /* round up trie size to 4-alignment */
b75a7d8f
A
419 trieSize=(trieSize+3)&~3;
420 offset+=trieSize>>2;
73c04bcf
A
421 indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */
422 indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */
423 indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */
b75a7d8f
A
424 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
425
426 if(beVerbose) {
374ca955 427 printf("trie size in bytes: %5u\n", (int)trieSize);
b75a7d8f
A
428 }
429
73c04bcf
A
430 if(csource) {
431 /* write .c file for hardcoded data */
432 UTrie trie={ NULL };
433 FILE *f;
434
435 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
436 if(U_FAILURE(errorCode)) {
437 fprintf(
438 stderr,
439 "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n",
440 u_errorName(errorCode));
441 return;
442 }
b75a7d8f 443
73c04bcf
A
444 f=usrc_create(dataDir, "uchar_props_data.c");
445 if(f!=NULL) {
446 usrc_writeArray(f,
447 "static const UVersionInfo formatVersion={",
448 dataInfo.formatVersion, 8, 4,
449 "};\n\n");
450 usrc_writeArray(f,
451 "static const UVersionInfo dataVersion={",
452 dataInfo.dataVersion, 8, 4,
453 "};\n\n");
454 usrc_writeUTrieArrays(f,
455 "static const uint16_t propsTrie_index[%ld]={\n", NULL,
456 &trie,
457 "\n};\n\n");
458 usrc_writeUTrieStruct(f,
459 "static const UTrie propsTrie={\n",
460 &trie, "propsTrie_index", NULL, NULL,
461 "};\n\n");
462
463 additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes);
464 size=4*offset+additionalPropsSize; /* total size of data */
465
466 usrc_writeArray(f,
467 "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
468 indexes, 32, UPROPS_INDEX_COUNT,
469 "};\n\n");
470 fclose(f);
471 }
472 } else {
473 /* write the data */
474 pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
475 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
476 if(U_FAILURE(errorCode)) {
477 fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
478 exit(errorCode);
479 }
b75a7d8f 480
73c04bcf
A
481 additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes);
482 size=4*offset+additionalPropsSize; /* total size of data */
b75a7d8f 483
73c04bcf
A
484 udata_writeBlock(pData, indexes, sizeof(indexes));
485 udata_writeBlock(pData, trieBlock, trieSize);
486 udata_writeBlock(pData, additionalProps, additionalPropsSize);
b75a7d8f 487
73c04bcf
A
488 /* finish up */
489 dataLength=udata_finish(pData, &errorCode);
490 if(U_FAILURE(errorCode)) {
491 fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
492 exit(errorCode);
493 }
b75a7d8f 494
73c04bcf
A
495 if(dataLength!=(long)size) {
496 fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
497 dataLength, (unsigned long)size);
498 exit(U_INTERNAL_PROGRAM_ERROR);
499 }
b75a7d8f
A
500 }
501
73c04bcf
A
502 if(beVerbose) {
503 printf("data size: %6lu\n", (unsigned long)size);
b75a7d8f 504 }
b75a7d8f
A
505}
506
507/*
508 * Hey, Emacs, please set the following:
509 *
510 * Local Variables:
511 * indent-tabs-mode: nil
512 * End:
513 *
514 */