]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genuca/genuca.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / tools / genuca / genuca.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2000-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: genuca.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created at the end of XX century
14 * created by: Vladimir Weinstein
15 *
16 * This program reads the Fractional UCA table and generates
17 * internal format for UCA table as well as inverse UCA table.
18 * It then writes binary files containing the data: ucadata.dat
19 * & invuca.dat
20 * Change history:
21 * 02/23/2001 grhoten Made it into a tool
22 * 02/23/2001 weiv Moved element & table handling code to i18n
23 * 05/09/2001 weiv Case bits are now in the CEs, not in front
24 */
25
26 #include <stdio.h>
27 #include "unicode/utypes.h"
28 #include "unicode/udata.h"
29 #include "ucol_imp.h"
30 #include "genuca.h"
31 #include "uoptions.h"
32 #include "toolutil.h"
33 #include "unewdata.h"
34 #include "cstring.h"
35 #include "cmemory.h"
36
37 /*
38 * Global - verbosity
39 */
/* When TRUE, progress and size diagnostics are printed to stdout. */
40 UBool VERBOSE = FALSE;
41
/* UCA version of the data being built; filled in by readAnElement() when it
 * meets the "[UCA version = " directive, and later copied into both headers. */
42 static UVersionInfo UCAVersion;
43
44 #if UCONFIG_NO_COLLATION
45
46 /* dummy UDataInfo cf. udata.h */
/* Placeholder data header used only when collation is compiled out
 * (UCONFIG_NO_COLLATION); format and version fields are all zero.
 * NOTE(review): the unlabeled fields follow the UDataInfo layout in udata.h - confirm there. */
47 static UDataInfo dummyDataInfo = {
48 sizeof(UDataInfo),
49 0,
50
51 U_IS_BIG_ENDIAN,
52 U_CHARSET_FAMILY,
53 U_SIZEOF_UCHAR,
54 0,
55
56 { 0, 0, 0, 0 }, /* dummy dataFormat */
57 { 0, 0, 0, 0 }, /* dummy formatVersion */
58 { 0, 0, 0, 0 } /* dummy dataVersion */
59 };
60
61 #else
62
/* Single shared element record: readAnElement() fills it in and returns its
 * address, so every call overwrites the result of the previous one. */
63 UCAElements le;
64
65 int32_t readElement(char **from, char *to, char separator, UErrorCode *status) {
66 if(U_FAILURE(*status)) {
67 return 0;
68 }
69 char buffer[1024];
70 int32_t i = 0;
71 while(**from != separator) {
72 if(**from != ' ') {
73 *(buffer+i++) = **from;
74 }
75 (*from)++;
76 }
77 (*from)++;
78 *(buffer + i) = 0;
79 //*to = (char *)malloc(strlen(buffer)+1);
80 strcpy(to, buffer);
81 return i/2;
82 }
83
84
85 uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UErrorCode *status) {
86 if(U_FAILURE(*status)) {
87 return 0;
88 }
89 uint32_t value = 0;
90 char primsave = '\0';
91 char secsave = '\0';
92 char tersave = '\0';
93 char *primend = primary+4;
94 if(strlen(primary) > 4) {
95 primsave = *primend;
96 *primend = '\0';
97 }
98 char *secend = secondary+2;
99 if(strlen(secondary) > 2) {
100 secsave = *secend;
101 *secend = '\0';
102 }
103 char *terend = tertiary+2;
104 if(strlen(tertiary) > 2) {
105 tersave = *terend;
106 *terend = '\0';
107 }
108 uint32_t primvalue = (uint32_t)((*primary!='\0')?strtoul(primary, &primend, 16):0);
109 uint32_t secvalue = (uint32_t)((*secondary!='\0')?strtoul(secondary, &secend, 16):0);
110 uint32_t tervalue = (uint32_t)((*tertiary!='\0')?strtoul(tertiary, &terend, 16):0);
111 if(primvalue <= 0xFF) {
112 primvalue <<= 8;
113 }
114
115 value = ((primvalue<<UCOL_PRIMARYORDERSHIFT)&UCOL_PRIMARYORDERMASK)|
116 ((secvalue<<UCOL_SECONDARYORDERSHIFT)&UCOL_SECONDARYORDERMASK)|
117 (tervalue&UCOL_TERTIARYORDERMASK);
118
119 if(primsave!='\0') {
120 *primend = primsave;
121 }
122 if(secsave!='\0') {
123 *secend = secsave;
124 }
125 if(tersave!='\0') {
126 *terend = tersave;
127 }
128 return value;
129 }
130
/* Inverse UCA table under construction. Per row: [0] first CE, [1] its
 * continuation CE or 0, [2] either a single code point or a size/offset
 * reference into stringContinue (see UCOL_INV_SIZEMASK / UCOL_INV_OFFSETMASK). */
131 static uint32_t inverseTable[0xFFFF][3];
/* Index of the last row in use; row 0 is reset to zeros by addToInverse() on first use. */
132 static uint32_t inversePos = 0;
/* Code unit storage for rows that map to more than one code point. */
133 static UChar stringContinue[0xFFFF];
/* Next free position in stringContinue. */
134 static uint32_t sContPos = 0;
135
136 static void addNewInverse(UCAElements *element, UErrorCode *status) {
137 if(U_FAILURE(*status)) {
138 return;
139 }
140 if(VERBOSE && isContinuation(element->CEs[1])) {
141 //fprintf(stdout, "+");
142 }
143 inversePos++;
144 inverseTable[inversePos][0] = element->CEs[0];
145 if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
146 inverseTable[inversePos][1] = element->CEs[1];
147 } else {
148 inverseTable[inversePos][1] = 0;
149 }
150 if(element->cSize < 2) {
151 inverseTable[inversePos][2] = element->cPoints[0];
152 } else { /* add a new store of cruft */
153 inverseTable[inversePos][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
154 memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
155 sContPos += element->cSize+1;
156 }
157 }
158
159 static void insertInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
160 if(U_FAILURE(*status)) {
161 return;
162 }
163
164 if(VERBOSE && isContinuation(element->CEs[1])) {
165 //fprintf(stdout, "+");
166 }
167 if(position <= inversePos) {
168 /*move stuff around */
169 uint32_t amountToMove = (inversePos - position+1)*sizeof(inverseTable[0]);
170 uprv_memmove(inverseTable[position+1], inverseTable[position], amountToMove);
171 }
172 inverseTable[position][0] = element->CEs[0];
173 if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
174 inverseTable[position][1] = element->CEs[1];
175 } else {
176 inverseTable[position][1] = 0;
177 }
178 if(element->cSize < 2) {
179 inverseTable[position][2] = element->cPoints[0];
180 } else { /* add a new store of cruft */
181 inverseTable[position][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
182 memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
183 sContPos += element->cSize+1;
184 }
185 inversePos++;
186 }
187
188 static void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
189
190 if(U_FAILURE(*status)) {
191 return;
192 }
193
194 if((inverseTable[position][2] & UCOL_INV_SIZEMASK) == 0) { /* single element, have to make new extension place and put both guys there */
195 stringContinue[sContPos] = (UChar)inverseTable[position][2];
196 inverseTable[position][2] = ((element->cSize+3) << UCOL_INV_SHIFTVALUE) | sContPos;
197 sContPos++;
198 stringContinue[sContPos++] = 0xFFFF;
199 memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
200 sContPos += element->cSize;
201 stringContinue[sContPos++] = 0xFFFE;
202 } else { /* adding to the already existing continuing table */
203 uint32_t contIndex = inverseTable[position][2] & UCOL_INV_OFFSETMASK;
204 uint32_t contSize = (inverseTable[position][2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE;
205
206 if(contIndex+contSize < sContPos) {
207 /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/
208 memcpy(stringContinue+contIndex+contSize+element->cSize+1, stringContinue+contIndex+contSize, (element->cSize+1)*sizeof(UChar));
209 }
210
211 stringContinue[contIndex+contSize-1] = 0xFFFF;
212 memcpy(stringContinue+contIndex+contSize, element->cPoints, element->cSize*sizeof(UChar));
213 sContPos += element->cSize+1;
214 stringContinue[contIndex+contSize+element->cSize] = 0xFFFE;
215
216 inverseTable[position][2] = ((contSize+element->cSize+1) << UCOL_INV_SHIFTVALUE) | contIndex;
217 }
218 }
219
220 static uint32_t addToInverse(UCAElements *element, UErrorCode *status) {
221 uint32_t comp = 0;
222 uint32_t position = inversePos;
223 uint32_t saveElement = element->CEs[0];
224 element->CEs[0] &= 0xFFFFFF3F;
225 if(element->noOfCEs == 1) {
226 element->CEs[1] = 0;
227 }
228 if(inversePos == 0) {
229 inverseTable[0][0] = inverseTable[0][1] = inverseTable[0][2] = 0;
230 addNewInverse(element, status);
231 } else if(inverseTable[inversePos][0] > element->CEs[0]) {
232 while(inverseTable[--position][0] > element->CEs[0]) {}
233 if(VERBOSE) { fprintf(stdout, "p:%i ", position); }
234 if(inverseTable[position][0] == element->CEs[0]) {
235 if(isContinuation(element->CEs[1])) {
236 comp = element->CEs[1];
237 } else {
238 comp = 0;
239 }
240 if(inverseTable[position][1] > comp) {
241 while(inverseTable[--position][1] > comp) {}
242 }
243 if(inverseTable[position][1] == comp) {
244 addToExistingInverse(element, position, status);
245 } else {
246 insertInverse(element, position+1, status);
247 }
248 } else {
249 if(VERBOSE) { fprintf(stdout, "ins"); }
250 insertInverse(element, position+1, status);
251 }
252 } else if(inverseTable[inversePos][0] == element->CEs[0]) {
253 if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
254 comp = element->CEs[1];
255 if(inverseTable[position][1] > comp) {
256 while(inverseTable[--position][1] > comp) {}
257 }
258 if(inverseTable[position][1] == comp) {
259 addToExistingInverse(element, position, status);
260 } else {
261 insertInverse(element, position+1, status);
262 }
263 } else {
264 addToExistingInverse(element, inversePos, status);
265 }
266 } else {
267 addNewInverse(element, status);
268 }
269 element->CEs[0] = saveElement;
270 if(VERBOSE) { fprintf(stdout, "+"); }
271 return inversePos;
272 }
273
274 static InverseUCATableHeader *assembleInverseTable(UErrorCode *status)
275 {
276 InverseUCATableHeader *result = NULL;
277 uint32_t headerByteSize = paddedsize(sizeof(InverseUCATableHeader));
278 uint32_t inverseTableByteSize = (inversePos+2)*sizeof(uint32_t)*3;
279 uint32_t contsByteSize = sContPos * sizeof(UChar);
280 uint32_t i = 0;
281
282 result = (InverseUCATableHeader *)uprv_malloc(headerByteSize + inverseTableByteSize + contsByteSize);
283 if(result != NULL) {
284 result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize;
285
286 inversePos++;
287 inverseTable[inversePos][0] = 0xFFFFFFFF;
288 inverseTable[inversePos][1] = 0xFFFFFFFF;
289 inverseTable[inversePos][2] = 0x0000FFFF;
290 inversePos++;
291
292 for(i = 2; i<inversePos; i++) {
293 if(inverseTable[i-1][0] > inverseTable[i][0]) {
294 fprintf(stderr, "Error at %i: %08X & %08X\n", i, inverseTable[i-1][0], inverseTable[i][0]);
295 } else if(inverseTable[i-1][0] == inverseTable[i][0] && !(inverseTable[i-1][1] < inverseTable[i][1])) {
296 fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", i, inverseTable[i-1][0], inverseTable[i-1][1], inverseTable[i][0], inverseTable[i][1]);
297 }
298 }
299
300 result->tableSize = inversePos;
301 result->contsSize = sContPos;
302
303 result->table = headerByteSize;
304 result->conts = headerByteSize + inverseTableByteSize;
305
306 memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize);
307 memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize);
308
309 } else {
310 *status = U_MEMORY_ALLOCATION_ERROR;
311 return NULL;
312 }
313
314 return result;
315 }
316
317
318 static void writeOutInverseData(InverseUCATableHeader *data,
319 const char *outputDir,
320 const char *copyright,
321 UErrorCode *status)
322 {
323 UNewDataMemory *pData;
324
325 long dataLength;
326
327 UDataInfo invUcaInfo;
328 uprv_memcpy(&invUcaInfo, &invUcaDataInfo, sizeof(UDataInfo));
329 u_getUnicodeVersion(invUcaInfo.dataVersion);
330
331 pData=udata_create(outputDir, INVC_DATA_TYPE, U_ICUDATA_NAME "_" INVC_DATA_NAME, &invUcaInfo,
332 copyright, status);
333
334 if(U_FAILURE(*status)) {
335 fprintf(stderr, "Error: unable to create data memory, error %d\n", *status);
336 return;
337 }
338
339 /* write the data to the file */
340 if (VERBOSE) {
341 fprintf(stdout, "Writing out inverse UCA table: %s%c%s.%s\n", outputDir, U_FILE_SEP_CHAR,
342 U_ICUDATA_NAME "_" INVC_DATA_NAME,
343 INVC_DATA_TYPE);
344 }
345 udata_writeBlock(pData, data, data->byteSize);
346
347 /* finish up */
348 dataLength=udata_finish(pData, status);
349 if(U_FAILURE(*status)) {
350 fprintf(stderr, "Error: error %d writing the output file\n", *status);
351 return;
352 }
353 }
354
355
356
/**
 * Converts one hexadecimal digit character to its numeric value.
 *
 * @param hex character to convert; upper and lower case letters are accepted
 * @return 0..15 for a hex digit, 0 for any other character
 */
static int32_t hex2num(char hex) {
    int32_t digit = 0;  /* anything that is not a hex digit maps to 0 */

    if('0' <= hex && hex <= '9') {
        digit = hex - '0';
    } else if('a' <= hex && hex <= 'f') {
        digit = 10 + (hex - 'a');
    } else if('A' <= hex && hex <= 'F') {
        digit = 10 + (hex - 'A');
    }
    return digit;
}
368
369 UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UErrorCode *status) {
370 char buffer[2048], primary[100], secondary[100], tertiary[100];
371 UBool detectedContraction;
372 int32_t i = 0;
373 unsigned int theValue;
374 char *pointer = NULL;
375 char *commentStart = NULL;
376 char *startCodePoint = NULL;
377 char *endCodePoint = NULL;
378 char *spacePointer = NULL;
379 char *result = fgets(buffer, 2048, data);
380 int32_t buflen = uprv_strlen(buffer);
381 if(U_FAILURE(*status)) {
382 return 0;
383 }
384 *primary = *secondary = *tertiary = '\0';
385 if(result == NULL) {
386 if(feof(data)) {
387 return NULL;
388 } else {
389 fprintf(stderr, "empty line but no EOF!\n");
390 *status = U_INVALID_FORMAT_ERROR;
391 return NULL;
392 }
393 }
394 while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
395 buffer[--buflen] = 0;
396 }
397
398 if(buffer[0] == 0 || buffer[0] == '#') {
399 return NULL; // just a comment, skip whole line
400 }
401
402 UCAElements *element = &le; //(UCAElements *)malloc(sizeof(UCAElements));
403
404 enum ActionType {
405 READCE,
406 READHEX,
407 READUCAVERSION
408 };
409
410 // Directives.
411 if(buffer[0] == '[') {
412 uint32_t cnt = 0;
413 struct {
414 char name[256];
415 uint32_t *what;
416 ActionType what_to_do;
417 } vt[] = { {"[first tertiary ignorable", consts->UCA_FIRST_TERTIARY_IGNORABLE, READCE},
418 {"[last tertiary ignorable", consts->UCA_LAST_TERTIARY_IGNORABLE, READCE},
419 {"[first secondary ignorable", consts->UCA_FIRST_SECONDARY_IGNORABLE, READCE},
420 {"[last secondary ignorable", consts->UCA_LAST_SECONDARY_IGNORABLE, READCE},
421 {"[first primary ignorable", consts->UCA_FIRST_PRIMARY_IGNORABLE, READCE},
422 {"[last primary ignorable", consts->UCA_LAST_PRIMARY_IGNORABLE, READCE},
423 {"[first variable", consts->UCA_FIRST_VARIABLE, READCE},
424 {"[last variable", consts->UCA_LAST_VARIABLE, READCE},
425 {"[first regular", consts->UCA_FIRST_NON_VARIABLE, READCE},
426 {"[last regular", consts->UCA_LAST_NON_VARIABLE, READCE},
427 {"[first implicit", consts->UCA_FIRST_IMPLICIT, READCE},
428 {"[last implicit", consts->UCA_LAST_IMPLICIT, READCE},
429 {"[first trailing", consts->UCA_FIRST_TRAILING, READCE},
430 {"[last trailing", consts->UCA_LAST_TRAILING, READCE},
431
432 {"[fixed top", &consts->UCA_PRIMARY_TOP_MIN, READHEX},
433 {"[fixed first implicit byte", &consts->UCA_PRIMARY_IMPLICIT_MIN, READHEX},
434 {"[fixed last implicit byte", &consts->UCA_PRIMARY_IMPLICIT_MAX, READHEX},
435 {"[fixed first trail byte", &consts->UCA_PRIMARY_TRAILING_MIN, READHEX},
436 {"[fixed last trail byte", &consts->UCA_PRIMARY_TRAILING_MAX, READHEX},
437 {"[fixed first special byte", &consts->UCA_PRIMARY_SPECIAL_MIN, READHEX},
438 {"[fixed last special byte", &consts->UCA_PRIMARY_SPECIAL_MAX, READHEX},
439 {"[variable top = ", &t->options->variableTopValue, READHEX},
440 {"[UCA version = ", NULL, READUCAVERSION}
441 };
442 for (cnt = 0; cnt<sizeof(vt)/sizeof(vt[0]); cnt++) {
443 uint32_t vtLen = (uint32_t)uprv_strlen(vt[cnt].name);
444 if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
445 element->variableTop = TRUE;
446 if(vt[cnt].what_to_do == READHEX) {
447 if(sscanf(buffer+vtLen, "%4x", &theValue) != 1) /* read first code point */
448 {
449 fprintf(stderr, " scanf(hex) failed on !\n ");
450 }
451 *(vt[cnt].what) = (UChar)theValue;
452 //if(cnt == 1) { // first implicit
453 // we need to set the value for top next
454 //uint32_t nextTop = ucol_prv_calculateImplicitPrimary(0x4E00); // CJK base
455 //consts->UCA_NEXT_TOP_VALUE = theValue<<24 | 0x030303;
456 //}
457 } else if (vt[cnt].what_to_do == READCE) { /* vt[cnt].what_to_do == READCE */
458 pointer = strchr(buffer+vtLen, '[');
459 if(pointer) {
460 pointer++;
461 element->sizePrim[0]=readElement(&pointer, primary, ',', status);
462 element->sizeSec[0]=readElement(&pointer, secondary, ',', status);
463 element->sizeTer[0]=readElement(&pointer, tertiary, ']', status);
464
465 vt[cnt].what[0] = getSingleCEValue(primary, secondary, tertiary, status);
466 if(element->sizePrim[0] > 2 || element->sizeSec[0] > 1 || element->sizeTer[0] > 1) {
467 uint32_t CEi = 1;
468 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
469 if(2*CEi<element->sizePrim[i]) {
470 value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
471 value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
472 }
473
474 if(2*CEi+1<element->sizePrim[i]) {
475 value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
476 value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
477 }
478
479 if(CEi<element->sizeSec[i]) {
480 value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
481 value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
482 }
483
484 if(CEi<element->sizeTer[i]) {
485 value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
486 value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
487 }
488
489 CEi++;
490
491 vt[cnt].what[1] = value;
492 //element->CEs[CEindex++] = value;
493 } else {
494 vt[cnt].what[1] = 0;
495 }
496 } else {
497 fprintf(stderr, "Failed to read a CE from line %s\n", buffer);
498 }
499 } else { //vt[cnt].what_to_do == READUCAVERSION
500 u_versionFromString(UCAVersion, buffer+vtLen);
501 if(VERBOSE) {
502 fprintf(stdout, "UCA version [%hu.%hu.%hu.%hu]\n", UCAVersion[0], UCAVersion[1], UCAVersion[2], UCAVersion[3]);
503 }
504 }
505 //element->cPoints[0] = (UChar)theValue;
506 //return element;
507 return NULL;
508 }
509 }
510 fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);
511 //*status = U_INVALID_FORMAT_ERROR;
512 return NULL;
513 }
514 element->variableTop = FALSE;
515
516 startCodePoint = buffer;
517 endCodePoint = strchr(startCodePoint, ';');
518
519 if(endCodePoint == 0) {
520 fprintf(stderr, "error - line with no code point!\n");
521 *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
522 return NULL;
523 } else {
524 *(endCodePoint) = 0;
525 }
526
527 if(element != NULL) {
528 memset(element, 0, sizeof(*element));
529 } else {
530 *status = U_MEMORY_ALLOCATION_ERROR;
531 return NULL;
532 }
533
534 element->cPoints = element->uchars;
535
536 spacePointer = strchr(buffer, ' ');
537 if(sscanf(buffer, "%4x", &theValue) != 1) /* read first code point */
538 {
539 fprintf(stderr, " scanf(hex) failed!\n ");
540 }
541 element->cPoints[0] = (UChar)theValue;
542
543 if(spacePointer == 0) {
544 detectedContraction = FALSE;
545 element->cSize = 1;
546 } else {
547 i = 1;
548 detectedContraction = TRUE;
549 while(spacePointer != NULL) {
550 sscanf(spacePointer+1, "%4x", &theValue);
551 element->cPoints[i++] = (UChar)theValue;
552 spacePointer = strchr(spacePointer+1, ' ');
553 }
554
555 element->cSize = i;
556
557 //fprintf(stderr, "Number of codepoints in contraction: %i\n", i);
558 }
559
560 startCodePoint = endCodePoint+1;
561
562 commentStart = strchr(startCodePoint, '#');
563 if(commentStart == NULL) {
564 commentStart = strlen(startCodePoint) + startCodePoint;
565 }
566
567 i = 0;
568 uint32_t CEindex = 0;
569 element->noOfCEs = 0;
570 for(;;) {
571 endCodePoint = strchr(startCodePoint, ']');
572 if(endCodePoint == NULL || endCodePoint >= commentStart) {
573 break;
574 }
575 pointer = strchr(startCodePoint, '[');
576 pointer++;
577
578 element->sizePrim[i]=readElement(&pointer, primary, ',', status);
579 element->sizeSec[i]=readElement(&pointer, secondary, ',', status);
580 element->sizeTer[i]=readElement(&pointer, tertiary, ']', status);
581
582
583 /* I want to get the CEs entered right here, including continuation */
584 element->CEs[CEindex++] = getSingleCEValue(primary, secondary, tertiary, status);
585
586 uint32_t CEi = 1;
587 while(2*CEi<element->sizePrim[i] || CEi<element->sizeSec[i] || CEi<element->sizeTer[i]) {
588 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
589 if(2*CEi<element->sizePrim[i]) {
590 value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
591 value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
592 }
593
594 if(2*CEi+1<element->sizePrim[i]) {
595 value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
596 value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
597 }
598
599 if(CEi<element->sizeSec[i]) {
600 value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
601 value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
602 }
603
604 if(CEi<element->sizeTer[i]) {
605 value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
606 value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
607 }
608
609 CEi++;
610
611 element->CEs[CEindex++] = value;
612 }
613
614 startCodePoint = endCodePoint+1;
615 i++;
616 }
617 element->noOfCEs = CEindex;
618
619 element->isThai = UCOL_ISTHAIPREVOWEL(element->cPoints[0]);
620
621 // we don't want any strange stuff after useful data!
622 while(pointer < commentStart) {
623 if(*pointer != ' ') {
624 *status=U_INVALID_FORMAT_ERROR;
625 break;
626 }
627 pointer++;
628 }
629
630 if(U_FAILURE(*status)) {
631 fprintf(stderr, "problem putting stuff in hash table\n");
632 *status = U_INTERNAL_PROGRAM_ERROR;
633 return NULL;
634 }
635
636 return element;
637 }
638
639
640 void writeOutData(UCATableHeader *data,
641 UCAConstants *consts,
642 UChar contractions[][3],
643 uint32_t noOfcontractions,
644 const char *outputDir,
645 const char *copyright,
646 UErrorCode *status)
647 {
648 if(U_FAILURE(*status)) {
649 return;
650 }
651
652 uint32_t size = data->size;
653
654 if(noOfcontractions != 0) {
655 contractions[noOfcontractions][0] = 0;
656 contractions[noOfcontractions][1] = 0;
657 contractions[noOfcontractions][2] = 0;
658 noOfcontractions++;
659
660
661 data->UCAConsts = data->size;
662 data->size += paddedsize(sizeof(UCAConstants));
663 data->contractionUCACombos = data->size;
664 data->size += paddedsize((noOfcontractions*3*sizeof(UChar)));
665 }
666
667 UNewDataMemory *pData;
668
669 long dataLength;
670 UDataInfo ucaInfo;
671 uprv_memcpy(&ucaInfo, &ucaDataInfo, sizeof(UDataInfo));
672 u_getUnicodeVersion(ucaInfo.dataVersion);
673
674 pData=udata_create(outputDir, UCA_DATA_TYPE, U_ICUDATA_NAME "_" UCA_DATA_NAME, &ucaInfo,
675 copyright, status);
676
677 if(U_FAILURE(*status)) {
678 fprintf(stderr, "Error: unable to create data memory, error %d\n", *status);
679 return;
680 }
681
682 /* write the data to the file */
683 if (VERBOSE) {
684 fprintf(stdout, "Writing out UCA table: %s%c%s.%s\n", outputDir,
685 U_FILE_SEP_CHAR,
686 U_ICUDATA_NAME "_" UCA_DATA_NAME,
687 UCA_DATA_TYPE);
688 }
689 udata_writeBlock(pData, data, size);
690
691 // output the constants here
692 udata_writeBlock(pData, consts, sizeof(UCAConstants));
693
694 if(noOfcontractions != 0) {
695 udata_writeBlock(pData, contractions, noOfcontractions*3*sizeof(UChar));
696 udata_writePadding(pData, paddedsize((noOfcontractions*3*sizeof(UChar))) - noOfcontractions*3*sizeof(uint16_t));
697 }
698
699 /* finish up */
700 dataLength=udata_finish(pData, status);
701 if(U_FAILURE(*status)) {
702 fprintf(stderr, "Error: error %d writing the output file\n", *status);
703 return;
704 }
705 }
706
707 static int32_t
708 write_uca_table(const char *filename,
709 const char *outputDir,
710 const char *copyright,
711 UErrorCode *status)
712 {
713 FILE *data = fopen(filename, "r");
714 uint32_t line = 0;
715 UCAElements *element = NULL;
716 UChar variableTopValue = 0;
717 UCATableHeader *myD = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
718 /* test for NULL */
719 if(myD == NULL) {
720 *status = U_MEMORY_ALLOCATION_ERROR;
721 fclose(data);
722 return 0;
723 }
724 UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
725 /* test for NULL */
726 if(opts == NULL) {
727 *status = U_MEMORY_ALLOCATION_ERROR;
728 uprv_free(myD);
729 fclose(data);
730 return 0;
731 }
732 UChar contractionCEs[256][3];
733 uint32_t noOfContractions = 0;
734 UCAConstants consts;
735 #if 0
736 UCAConstants consts = {
737 UCOL_RESET_TOP_VALUE,
738 UCOL_FIRST_PRIMARY_IGNORABLE,
739 UCOL_LAST_PRIMARY_IGNORABLE,
740 UCOL_LAST_PRIMARY_IGNORABLE_CONT,
741 UCOL_FIRST_SECONDARY_IGNORABLE,
742 UCOL_LAST_SECONDARY_IGNORABLE,
743 UCOL_FIRST_TERTIARY_IGNORABLE,
744 UCOL_LAST_TERTIARY_IGNORABLE,
745 UCOL_FIRST_VARIABLE,
746 UCOL_LAST_VARIABLE,
747 UCOL_FIRST_NON_VARIABLE,
748 UCOL_LAST_NON_VARIABLE,
749
750 UCOL_NEXT_TOP_VALUE,
751 /*
752 UCOL_NEXT_FIRST_PRIMARY_IGNORABLE,
753 UCOL_NEXT_LAST_PRIMARY_IGNORABLE,
754 UCOL_NEXT_FIRST_SECONDARY_IGNORABLE,
755 UCOL_NEXT_LAST_SECONDARY_IGNORABLE,
756 UCOL_NEXT_FIRST_TERTIARY_IGNORABLE,
757 UCOL_NEXT_LAST_TERTIARY_IGNORABLE,
758 UCOL_NEXT_FIRST_VARIABLE,
759 UCOL_NEXT_LAST_VARIABLE,
760 */
761
762 PRIMARY_IMPLICIT_MIN,
763 PRIMARY_IMPLICIT_MAX
764 };
765 #endif
766
767
768 if(data == NULL) {
769 fprintf(stderr, "Couldn't open file: %s\n", filename);
770 return -1;
771 }
772
773 memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF);
774
775 opts->variableTopValue = variableTopValue;
776 opts->strength = UCOL_TERTIARY;
777 opts->frenchCollation = UCOL_OFF;
778 opts->alternateHandling = UCOL_NON_IGNORABLE; /* attribute for handling variable elements*/
779 opts->caseFirst = UCOL_OFF; /* who goes first, lower case or uppercase */
780 opts->caseLevel = UCOL_OFF; /* do we have an extra case level */
781 opts->normalizationMode = UCOL_OFF; /* attribute for normalization */
782 opts->hiraganaQ = UCOL_OFF; /* attribute for JIS X 4061, used only in Japanese */
783 myD->jamoSpecial = FALSE;
784
785 tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, IMPLICIT_TAG, status);
786 if(U_FAILURE(*status))
787 {
788 fprintf(stderr, "Failed to init UCA temp table: %s\n", u_errorName(*status));
789 return -1;
790 }
791
792 #if 0
793 IMPLICIT_TAG = 9,
794 /*
795 *****************************************************************************************
796 * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF
797 ******************************************************************************************
798 */
799 #endif
800
801 // * set to zero
802 struct {
803 UChar32 start;
804 UChar32 end;
805 int32_t value;
806 } ranges[] =
807 {
808 #if 0
809 {0xAC00, 0xD7AF, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
810 {0xD800, 0xDBFF, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/
811 {0xDC00, 0xDFFF, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) }, //2 TRAIL_SURROGATE DC00-DFFF
812 {0x3400, 0x4DB5, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
813 {0x4E00, 0x9FA5, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
814 {0xF900, 0xFA2D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
815 {0x20000, 0x2A6D6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
816 {0x2F800, 0x2FA1D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
817 #endif
818 {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
819 {0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/
820 {0xDC00, 0xE000, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) }, //2 TRAIL_SURROGATE DC00-DFFF
821 // Now directly handled in the collation code by the swapCJK function.
822 //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
823 //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
824 //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
825 //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
826 //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
827 };
828 uint32_t i = 0;
829
830 for(i = 0; i<sizeof(ranges)/sizeof(ranges[0]); i++) {
831 /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */
832 utrie_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value, TRUE);
833 }
834
835
836 int32_t surrogateCount = 0;
837 while(!feof(data)) {
838 if(U_FAILURE(*status)) {
839 fprintf(stderr, "Something returned an error %i (%s) while processing line %i of %s. Exiting...\n",
840 *status, u_errorName(*status), line, filename);
841 exit(*status);
842 }
843
844 element = readAnElement(data, t, &consts, status);
845 line++;
846 if(VERBOSE) {
847 fprintf(stdout, "%i ", line);
848 }
849 if(element != NULL) {
850 // we have read the line, now do something sensible with the read data!
851
852 // Below stuff was taken care of in readAnElement
853 //if(element->variableTop == TRUE && variableTopValue == 0) {
854 // t->options->variableTopValue = element->cPoints[0];
855 //}
856
857 // if element is a contraction, we want to add it to contractions
858 if(element->cSize > 1 && element->cPoints[0] != 0xFDD0) { // this is a contraction
859 if(UTF_IS_LEAD(element->cPoints[0]) && UTF_IS_TRAIL(element->cPoints[1]) && element->cSize == 2) {
860 surrogateCount++;
861 } else {
862 contractionCEs[noOfContractions][0] = element->cPoints[0];
863 contractionCEs[noOfContractions][1] = element->cPoints[1];
864 if(element->cSize > 2) { // the third one
865 contractionCEs[noOfContractions][2] = element->cPoints[2];
866 } else {
867 contractionCEs[noOfContractions][2] = 0;
868 }
869 noOfContractions++;
870 }
871 }
872
873 /* we're first adding to inverse, because addAnElement will reverse the order */
874 /* of code points and stuff... we don't want that to happen */
875 addToInverse(element, status);
876 if(!(element->cSize > 1 && element->cPoints[0] == 0xFDD0)) {
877 uprv_uca_addAnElement(t, element, status);
878 }
879 }
880 }
881
882 if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
883 fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
884 return -1;
885 }
886
887
888 if (VERBOSE) {
889 fprintf(stdout, "\nLines read: %i\n", line);
890 fprintf(stdout, "Surrogate count: %i\n", surrogateCount);
891 fprintf(stdout, "Raw data breakdown:\n");
892 /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
893 fprintf(stdout, "Number of contractions: %i\n", noOfContractions);
894 fprintf(stdout, "Contraction image size: %i\n", t->image->contractionSize);
895 fprintf(stdout, "Expansions size: %i\n", t->expansions->position);
896 }
897
898
899 /* produce canonical closure for table */
900 /* first set up constants for implicit calculation */
901 uprv_uca_initImplicitConstants(consts.UCA_PRIMARY_IMPLICIT_MIN);
902 /* do the closure */
903 int32_t noOfClosures = uprv_uca_canonicalClosure(t, status);
904 if(noOfClosures != 0) {
905 fprintf(stderr, "Warning: %i canonical closures occured!\n", noOfClosures);
906 }
907
908 /* test */
909 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
910
911 if (VERBOSE) {
912 fprintf(stdout, "Compacted data breakdown:\n");
913 /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
914 fprintf(stdout, "Number of contractions: %i\n", noOfContractions);
915 fprintf(stdout, "Contraction image size: %i\n", t->image->contractionSize);
916 fprintf(stdout, "Expansions size: %i\n", t->expansions->position);
917 }
918
919 /* populate the version info struct with version info*/
920 myData->version[0] = UCOL_BUILDER_VERSION;
921 myData->version[1] = UCAVersion[0];
922 myData->version[2] = UCAVersion[1];
923 myData->version[3] = UCAVersion[2];
924 /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
925 // Removed this macro. Instead, we use the fields below
926 //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION;
927 //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt
928 uprv_memcpy(myData->UCAVersion, UCAVersion, sizeof(UVersionInfo));
929 u_getUnicodeVersion(myData->UCDVersion);
930
931 writeOutData(myData, &consts, contractionCEs, noOfContractions, outputDir, copyright, status);
932
933 InverseUCATableHeader *inverse = assembleInverseTable(status);
934 uprv_memcpy(inverse->UCAVersion, UCAVersion, sizeof(UVersionInfo));
935 writeOutInverseData(inverse, outputDir, copyright, status);
936
937 uprv_uca_closeTempTable(t);
938 uprv_free(myD);
939 uprv_free(opts);
940
941
942 uprv_free(myData);
943 uprv_free(inverse);
944 fclose(data);
945
946 return 0;
947 }
948
949 #endif /* #if !UCONFIG_NO_COLLATION */
950
951 static UOption options[]={
952 UOPTION_HELP_H, /* 0 Numbers for those who*/
953 UOPTION_HELP_QUESTION_MARK, /* 1 can't count. */
954 UOPTION_COPYRIGHT, /* 2 */
955 UOPTION_VERSION, /* 3 */
956 UOPTION_DESTDIR, /* 4 */
957 UOPTION_SOURCEDIR, /* 5 */
958 UOPTION_VERBOSE, /* 6 */
959 UOPTION_ICUDATADIR /* 7 */
960 /* weiv can't count :))))) */
961 };
962
963 int main(int argc, char* argv[]) {
964 UErrorCode status = U_ZERO_ERROR;
965 const char* destdir = NULL;
966 const char* srcDir = NULL;
967 char filename[300];
968 char *basename = NULL;
969 const char *copyright = NULL;
970 uprv_memset(&UCAVersion, 0, 4);
971
972 U_MAIN_INIT_ARGS(argc, argv);
973
974 /* preset then read command line options */
975 options[4].value=u_getDataDirectory();
976 options[5].value="";
977 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
978
979 /* error handling, printing usage message */
980 if(argc<0) {
981 fprintf(stderr,
982 "error in command line argument \"%s\"\n",
983 argv[-argc]);
984 } else if(argc<2) {
985 argc=-1;
986 }
987 if(options[0].doesOccur || options[1].doesOccur) {
988 fprintf(stderr,
989 "usage: %s [-options] file\n"
990 "\tRead in UCA collation text data and write out the binary collation data\n"
991 "options:\n"
992 "\t-h or -? or --help this usage text\n"
993 "\t-V or --version show a version message\n"
994 "\t-c or --copyright include a copyright notice\n"
995 "\t-d or --destdir destination directory, followed by the path\n"
996 "\t-s or --sourcedir source directory, followed by the path\n"
997 "\t-v or --verbose turn on verbose output\n"
998 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
999 "\t followed by path, defaults to %s\n",
1000 argv[0], u_getDataDirectory());
1001 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1002 }
1003
1004 if(options[3].doesOccur) {
1005 fprintf(stdout, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n",
1006 #if UCONFIG_NO_COLLATION
1007 0, 0
1008 #else
1009 ucaDataInfo.formatVersion[0], ucaDataInfo.formatVersion[1]
1010 #endif
1011 );
1012 fprintf(stdout, "Copyright (C) 2000-2001, International Business Machines\n");
1013 fprintf(stdout, "Corporation and others. All Rights Reserved.\n");
1014 exit(0);
1015 }
1016
1017 /* get the options values */
1018 destdir = options[4].value;
1019 srcDir = options[5].value;
1020 VERBOSE = options[6].doesOccur;
1021
1022 if (options[2].doesOccur) {
1023 copyright = U_COPYRIGHT_STRING;
1024 }
1025
1026 if (options[7].doesOccur) {
1027 u_setDataDirectory(options[7].value);
1028 }
1029
1030 /* prepare the filename beginning with the source dir */
1031 uprv_strcpy(filename, srcDir);
1032 basename=filename+uprv_strlen(filename);
1033
1034 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
1035 *basename++ = U_FILE_SEP_CHAR;
1036 }
1037
1038 if(argc < 0) {
1039 uprv_strcpy(basename, "FractionalUCA.txt");
1040 } else {
1041 argv++;
1042 uprv_strcpy(basename, getLongPathname(*argv));
1043 }
1044
1045 #if 0
1046 if(u_getCombiningClass(0x0053) == 0)
1047 {
1048 fprintf(stderr, "SEVERE ERROR: Normalization data is not functioning! Bailing out. Was not able to load unorm.dat.\n");
1049 exit(1);
1050 }
1051 #endif
1052
1053 #if UCONFIG_NO_COLLATION
1054
1055 UNewDataMemory *pData;
1056 const char *msg;
1057
1058 msg = "genuca writes dummy " U_ICUDATA_NAME "_" UCA_DATA_NAME "." UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h";
1059 fprintf(stderr, "%s\n", msg);
1060 pData = udata_create(destdir, UCA_DATA_TYPE, U_ICUDATA_NAME "_" UCA_DATA_NAME, &dummyDataInfo,
1061 NULL, &status);
1062 udata_writeBlock(pData, msg, strlen(msg));
1063 udata_finish(pData, &status);
1064
1065 msg = "genuca writes dummy " U_ICUDATA_NAME "_" INVC_DATA_NAME "." INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h";
1066 fprintf(stderr, "%s\n", msg);
1067 pData = udata_create(destdir, INVC_DATA_TYPE, U_ICUDATA_NAME "_" INVC_DATA_NAME, &dummyDataInfo,
1068 NULL, &status);
1069 udata_writeBlock(pData, msg, strlen(msg));
1070 udata_finish(pData, &status);
1071
1072 return (int)status;
1073
1074 #else
1075
1076 return write_uca_table(filename, destdir, copyright, &status);
1077
1078 #endif
1079 }
1080
1081 /*
1082 * Hey, Emacs, please set the following:
1083 *
1084 * Local Variables:
1085 * indent-tabs-mode: nil
1086 * End:
1087 *
1088 */