]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genbidi/store.c
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / tools / genbidi / store.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: store.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004dec30
14 * created by: Markus W. Scherer
15 *
16 * Store Unicode bidi/shaping properties efficiently for
17 * random access.
18 */
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "utrie.h"
27 #include "uarrsort.h"
28 #include "unicode/udata.h"
29 #include "unewdata.h"
30 #include "propsvec.h"
31 #include "writesrc.h"
32 #include "ubidi_props.h"
33 #include "genbidi.h"
34
35 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
36
37 /* Unicode bidi/shaping properties file format ---------------------------------
38
39 The file format prepared and written here contains several data
40 structures that store indexes or data.
41
42 Before the data contents described below, there are the headers required by
43 the udata API for loading ICU data. Especially, a UDataInfo structure
44 precedes the actual data. It contains platform properties values and the
45 file format version.
46
47 The following is a description of format version 1.0 .
48
49 The file contains the following structures:
50
51 const int32_t indexes[i0] with values i0, i1, ...:
52 (see UBIDI_IX_... constants for names of indexes)
53
54 i0 indexLength; -- length of indexes[] (UBIDI_IX_TOP)
55 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
56 i2 trieSize; -- size in bytes of the bidi/shaping properties trie
57 i3 mirrorLength; -- length in uint32_t of the bidi mirroring array
58
59 i4 jgStart; -- first code point with Joining_Group data
60 i5 jgLimit; -- limit code point for Joining_Group data
61
62 i6..i14 reservedIndexes; -- reserved values; 0 for now
63
64 i15 maxValues; -- maximum code values for enumerated properties
65 bits 23..16 contain the max value for Joining_Group,
66 otherwise the bits are used like enum fields in the trie word
67
68 Serialized trie, see utrie.h;
69
70 const uint32_t mirrors[mirrorLength];
71
72 const uint8_t jgArray[i5-i4]; -- (i5-i4) is always a multiple of 4
73
74 Trie data word:
75 Bits
76 15..13 signed delta to bidi mirroring code point
77 (add delta to input code point)
78 0 no such code point (source maps to itself)
79 -3..-1, 1..3 delta
80 -4 look in mirrors table
81 12 is mirrored
82 11 Bidi_Control
83 10 Join_Control
84 9.. 8 reserved (set to 0)
85 7.. 5 Joining_Type
86 4.. 0 BiDi category
87
88
89 Mirrors:
90 Stores some of the bidi mirroring data, where each code point maps to
91 at most one other.
92 Most code points do not have a mirroring code point; most that do have a signed
93 delta stored in the trie data value. Only those where the delta does not fit
94 into the trie data are stored in this table.
95
96 Logically, this is a two-column table with source and mirror code points.
97
98 Physically, the table is compressed by taking advantage of the fact that each
99 mirror code point is also a source code point
100 (each of them is a mirror of the other).
101 Therefore, both logical columns contain the same set of code points, which needs
102 to be stored only once.
103
104 The table stores source code points, and also for each the index of its mirror
105 code point in the same table, in a simple array of uint32_t.
106 Bits
107 31..21 index to mirror code point (unsigned)
108 20.. 0 source code point
109
110 The table is sorted by source code points.
111
112
113 Joining_Group array:
114 The Joining_Group values do not fit into the 16-bit trie, but the data is also
115 limited to a small range of code points (Arabic and Syriac) and not
116 well compressible.
117
118 The start and limit code points for the range are stored in the indexes[]
119 array, and the jgArray[] stores a byte for each of these code points,
120 containing the Joining_Group value.
121
122 All code points outside of this range have No_Joining_Group (0).
123
124 ----------------------------------------------------------------------------- */
125
126 /* UDataInfo cf. udata.h */
127 static UDataInfo dataInfo={
128 sizeof(UDataInfo),
129 0,
130
131 U_IS_BIG_ENDIAN,
132 U_CHARSET_FAMILY,
133 U_SIZEOF_UCHAR,
134 0,
135
136 /* dataFormat="BiDi" */
137 { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 },
138 { 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
139 { 4, 0, 1, 0 } /* dataVersion */
140 };
141
142 /* exceptions values */
143 static uint32_t mirrors[UBIDI_MAX_MIRROR_INDEX+1][2];
144 static uint16_t mirrorTop=0;
145
146 /* -------------------------------------------------------------------------- */
147
148 extern void
149 setUnicodeVersion(const char *v) {
150 UVersionInfo version;
151 u_versionFromString(version, v);
152 uprv_memcpy(dataInfo.dataVersion, version, 4);
153 }
154
155 /* bidi mirroring table ----------------------------------------------------- */
156
157 extern void
158 addMirror(UChar32 src, UChar32 mirror) {
159 UErrorCode errorCode;
160 int32_t delta;
161
162 delta=mirror-src;
163 if(delta==0) {
164 return; /* mapping to self=no mapping */
165 }
166
167 if(delta<UBIDI_MIN_MIRROR_DELTA || UBIDI_MAX_MIRROR_DELTA<delta) {
168 /* delta does not fit into the trie properties value, store in the mirrors[] table */
169 if(mirrorTop==LENGTHOF(mirrors)) {
170 fprintf(stderr, "genbidi error: too many long-distance mirroring mappings\n");
171 exit(U_BUFFER_OVERFLOW_ERROR);
172 }
173
174 /* possible: search the table so far and see if src is already listed */
175
176 mirrors[mirrorTop][0]=(uint32_t)src;
177 mirrors[mirrorTop][1]=(uint32_t)mirror;
178 ++mirrorTop;
179
180 /* set an escape marker in src's properties */
181 delta=UBIDI_ESC_MIRROR_DELTA;
182 }
183
184 errorCode=U_ZERO_ERROR;
185 if(
186 !upvec_setValue(
187 pv, src, src+1, 0,
188 (uint32_t)delta<<UBIDI_MIRROR_DELTA_SHIFT, (uint32_t)(-1)<<UBIDI_MIRROR_DELTA_SHIFT,
189 &errorCode)
190 ) {
191 fprintf(stderr, "genbidi error: unable to set mirroring delta, code: %s\n",
192 u_errorName(errorCode));
193 exit(errorCode);
194 }
195 }
196
197 static int32_t U_CALLCONV
198 compareMirror(const void *context, const void *left, const void *right) {
199 UChar32 l, r;
200
201 l=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)left)[0]);
202 r=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)right)[0]);
203 return l-r;
204 }
205
206 static void
207 makeMirror() {
208 uint32_t *reducedMirror;
209 UErrorCode errorCode;
210 int32_t i, j, start, limit, step;
211 uint32_t c;
212
213 /* sort the mirroring table by source code points */
214 errorCode=U_ZERO_ERROR;
215 uprv_sortArray(mirrors, mirrorTop, 8,
216 compareMirror, NULL, FALSE, &errorCode);
217
218 /*
219 * reduce the 2-column table to a single column
220 * by putting the index to the mirror entry into the source entry
221 *
222 * first:
223 * find each mirror code point in the source column and set each other's indexes
224 *
225 * second:
226 * reduce the table, combine the source code points with their indexes
227 * and store as a simple array of uint32_t
228 */
229 for(i=0; i<mirrorTop; ++i) {
230 c=mirrors[i][1]; /* mirror code point */
231 if(c>0x1fffff) {
232 continue; /* this entry already has an index */
233 }
234
235 /* search for the mirror code point in the source column */
236 if(c<mirrors[i][0]) {
237 /* search before i */
238 start=i-1;
239 limit=-1;
240 step=-1;
241 } else {
242 start=i+1;
243 limit=mirrorTop;
244 step=1;
245 }
246
247 for(j=start;; j+=step) {
248 if(j==limit) {
249 fprintf(stderr,
250 "genbidi error: bidi mirror does not roundtrip - %04lx->%04lx->?\n",
251 (long)mirrors[i][0], (long)mirrors[i][1]);
252 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
253 }
254 if(c==mirrors[j][0]) {
255 /*
256 * found the mirror code point c in the source column,
257 * set both entries' indexes to each other
258 */
259 if(UBIDI_GET_MIRROR_CODE_POINT(mirrors[i][0])!=UBIDI_GET_MIRROR_CODE_POINT(mirrors[j][1])) {
260 /* roundtrip check fails */
261 fprintf(stderr,
262 "genbidi error: bidi mirrors do not roundtrip - %04lx->%04lx->%04lx\n",
263 (long)mirrors[i][0], (long)mirrors[i][1], (long)mirrors[j][1]);
264 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
265 } else {
266 mirrors[i][1]|=(uint32_t)j<<UBIDI_MIRROR_INDEX_SHIFT;
267 mirrors[j][1]|=(uint32_t)i<<UBIDI_MIRROR_INDEX_SHIFT;
268 }
269 break;
270 }
271 }
272 }
273
274 /* now the second step, the actual reduction of the table */
275 reducedMirror=mirrors[0];
276 for(i=0; i<mirrorTop; ++i) {
277 reducedMirror[i]=mirrors[i][0]|(mirrors[i][1]&~0x1fffff);
278 }
279
280 if(U_FAILURE(errorCode)) {
281 exit(errorCode);
282 }
283 }
284
285 /* generate output data ----------------------------------------------------- */
286
287 extern void
288 generateData(const char *dataDir, UBool csource) {
289 static int32_t indexes[UBIDI_IX_TOP]={
290 UBIDI_IX_TOP
291 };
292 static uint8_t trieBlock[40000];
293 static uint8_t jgArray[0x300]; /* at most for U+0600..U+08FF */
294
295 const uint32_t *row;
296 UChar32 start, limit, prev, jgStart;
297 int32_t i;
298
299 UNewDataMemory *pData;
300 UNewTrie *pTrie;
301 UErrorCode errorCode=U_ZERO_ERROR;
302 int32_t trieSize;
303 long dataLength;
304
305 makeMirror();
306
307 pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
308 if(pTrie==NULL) {
309 fprintf(stderr, "genbidi error: unable to create a UNewTrie\n");
310 exit(U_MEMORY_ALLOCATION_ERROR);
311 }
312
313 prev=jgStart=0;
314 for(i=0; (row=upvec_getRow(pv, i, &start, &limit))!=NULL; ++i) {
315 /* store most values from vector column 0 in the trie */
316 if(!utrie_setRange32(pTrie, start, limit, *row, TRUE)) {
317 fprintf(stderr, "genbidi error: unable to set trie value (overflow)\n");
318 exit(U_BUFFER_OVERFLOW_ERROR);
319 }
320
321 /* store Joining_Group values from vector column 1 in a simple byte array */
322 if(row[1]!=0) {
323 if(start<0x600 || 0x900<=limit) {
324 fprintf(stderr, "genbidi error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n",
325 (long)start, (long)limit);
326 exit(U_ILLEGAL_ARGUMENT_ERROR);
327 }
328
329 if(prev==0) {
330 /* first code point with any value */
331 prev=jgStart=start;
332 } else {
333 /* add No_Joining_Group for code points between prev and start */
334 while(prev<start) {
335 jgArray[prev++ -jgStart]=0;
336 }
337 }
338
339 /* set Joining_Group value for start..limit */
340 while(prev<limit) {
341 jgArray[prev++ -jgStart]=(uint8_t)row[1];
342 }
343 }
344 }
345
346 /* finish jgArray, pad to multiple of 4 */
347 while((prev-jgStart)&3) {
348 jgArray[prev++ -jgStart]=0;
349 }
350 indexes[UBIDI_IX_JG_START]=jgStart;
351 indexes[UBIDI_IX_JG_LIMIT]=prev;
352
353 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
354 if(U_FAILURE(errorCode)) {
355 fprintf(stderr, "genbidi error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
356 exit(errorCode);
357 }
358
359 indexes[UBIDI_IX_TRIE_SIZE]=trieSize;
360 indexes[UBIDI_IX_MIRROR_LENGTH]=mirrorTop;
361 indexes[UBIDI_IX_LENGTH]=
362 (int32_t)sizeof(indexes)+
363 trieSize+
364 4*mirrorTop+
365 (prev-jgStart);
366
367 if(beVerbose) {
368 printf("trie size in bytes: %5d\n", (int)trieSize);
369 printf("size in bytes of mirroring table: %5d\n", (int)(4*mirrorTop));
370 printf("length of Joining_Group array: %5d (U+%04x..U+%04x)\n", (int)(prev-jgStart), (int)jgStart, (int)(prev-1));
371 printf("data size: %5d\n", (int)indexes[UBIDI_IX_LENGTH]);
372 }
373
374 indexes[UBIDI_MAX_VALUES_INDEX]=
375 ((int32_t)U_CHAR_DIRECTION_COUNT-1)|
376 (((int32_t)U_JT_COUNT-1)<<UBIDI_JT_SHIFT)|
377 (((int32_t)U_JG_COUNT-1)<<UBIDI_MAX_JG_SHIFT);
378
379 if(csource) {
380 /* write .c file for hardcoded data */
381 UTrie trie={ NULL };
382 FILE *f;
383
384 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
385 if(U_FAILURE(errorCode)) {
386 fprintf(
387 stderr,
388 "genbidi error: failed to utrie_unserialize(ubidi.icu trie) - %s\n",
389 u_errorName(errorCode));
390 return;
391 }
392
393 f=usrc_create(dataDir, "ubidi_props_data.c");
394 if(f!=NULL) {
395 usrc_writeArray(f,
396 "static const UVersionInfo ubidi_props_dataVersion={",
397 dataInfo.dataVersion, 8, 4,
398 "};\n\n");
399 usrc_writeArray(f,
400 "static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={",
401 indexes, 32, UBIDI_IX_TOP,
402 "};\n\n");
403 usrc_writeUTrieArrays(f,
404 "static const uint16_t ubidi_props_trieIndex[%ld]={\n", NULL,
405 &trie,
406 "\n};\n\n");
407 usrc_writeArray(f,
408 "static const uint32_t ubidi_props_mirrors[%ld]={\n",
409 mirrors, 32, mirrorTop,
410 "\n};\n\n");
411 usrc_writeArray(f,
412 "static const uint8_t ubidi_props_jgArray[%ld]={\n",
413 jgArray, 8, prev-jgStart,
414 "\n};\n\n");
415 fputs(
416 "static const UBiDiProps ubidi_props_singleton={\n"
417 " NULL,\n"
418 " ubidi_props_indexes,\n"
419 " ubidi_props_mirrors,\n"
420 " ubidi_props_jgArray,\n",
421 f);
422 usrc_writeUTrieStruct(f,
423 " {\n",
424 &trie, "ubidi_props_trieIndex", NULL, NULL,
425 " },\n");
426 usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n");
427 fputs("};\n", f);
428 fclose(f);
429 }
430 } else {
431 /* write the data */
432 pData=udata_create(dataDir, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &dataInfo,
433 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
434 if(U_FAILURE(errorCode)) {
435 fprintf(stderr, "genbidi: unable to create data memory, %s\n", u_errorName(errorCode));
436 exit(errorCode);
437 }
438
439 udata_writeBlock(pData, indexes, sizeof(indexes));
440 udata_writeBlock(pData, trieBlock, trieSize);
441 udata_writeBlock(pData, mirrors, 4*mirrorTop);
442 udata_writeBlock(pData, jgArray, prev-jgStart);
443
444 /* finish up */
445 dataLength=udata_finish(pData, &errorCode);
446 if(U_FAILURE(errorCode)) {
447 fprintf(stderr, "genbidi: error %d writing the output file\n", errorCode);
448 exit(errorCode);
449 }
450
451 if(dataLength!=indexes[UBIDI_IX_LENGTH]) {
452 fprintf(stderr, "genbidi: data length %ld != calculated size %d\n",
453 dataLength, (int)indexes[UBIDI_IX_LENGTH]);
454 exit(U_INTERNAL_PROGRAM_ERROR);
455 }
456 }
457
458 utrie_close(pTrie);
459 upvec_close(pv);
460 }
461
462 /*
463 * Hey, Emacs, please set the following:
464 *
465 * Local Variables:
466 * indent-tabs-mode: nil
467 * End:
468 *
469 */