2 *******************************************************************************
4 * Copyright (C) 2000-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: genuca.cpp
10 * tab size: 8 (not used)
13 * created at the end of XX century
14 * created by: Vladimir Weinstein
16 * This program reads the Fractional UCA table and generates the
17 * internal format for the UCA table as well as the inverse UCA table.
18 * It then writes binary files containing the data: ucadata.dat
21 * 02/23/2001 grhoten Made it into a tool
22 * 02/23/2001 weiv Moved element & table handling code to i18n
23 * 05/09/2001 weiv Case bits are now in the CEs, not in front
27 #include "unicode/utypes.h"
28 #include "unicode/udata.h"
/* Verbose-output switch. main() sets it from the -v/--verbose option
 * (options[6].doesOccur); the table-building code tests it before
 * printing progress markers to stdout. */
40 UBool VERBOSE
= FALSE
;
/* UCA version of the input data. main() zeroes it on startup, and
 * u_versionFromString() fills it when a "[UCA version = " line is read.
 * write_uca_table() reports an error if it is still all zeros after the
 * input has been read, and otherwise copies it into the headers of both
 * generated tables. */
42 static UVersionInfo UCAVersion
;
44 #if UCONFIG_NO_COLLATION
46 /* dummy UDataInfo cf. udata.h */
47 static UDataInfo dummyDataInfo
= {
56 { 0, 0, 0, 0 }, /* dummy dataFormat */
57 { 0, 0, 0, 0 }, /* dummy formatVersion */
58 { 0, 0, 0, 0 } /* dummy dataVersion */
65 int32_t readElement(char **from
, char *to
, char separator
, UErrorCode
*status
) {
66 if(U_FAILURE(*status
)) {
71 while(**from
!= separator
) {
73 *(buffer
+i
++) = **from
;
79 //*to = (char *)malloc(strlen(buffer)+1);
85 uint32_t getSingleCEValue(char *primary
, char *secondary
, char *tertiary
, UErrorCode
*status
) {
86 if(U_FAILURE(*status
)) {
93 char *primend
= primary
+4;
94 if(strlen(primary
) > 4) {
98 char *secend
= secondary
+2;
99 if(strlen(secondary
) > 2) {
103 char *terend
= tertiary
+2;
104 if(strlen(tertiary
) > 2) {
108 uint32_t primvalue
= (uint32_t)((*primary
!='\0')?strtoul(primary
, &primend
, 16):0);
109 uint32_t secvalue
= (uint32_t)((*secondary
!='\0')?strtoul(secondary
, &secend
, 16):0);
110 uint32_t tervalue
= (uint32_t)((*tertiary
!='\0')?strtoul(tertiary
, &terend
, 16):0);
111 if(primvalue
<= 0xFF) {
115 value
= ((primvalue
<<UCOL_PRIMARYORDERSHIFT
)&UCOL_PRIMARYORDERMASK
)|
116 ((secvalue
<<UCOL_SECONDARYORDERSHIFT
)&UCOL_SECONDARYORDERMASK
)|
117 (tervalue
&UCOL_TERTIARYORDERMASK
);
/* Inverse UCA table under construction, one row of three words per entry:
 *   [0] the first CE of the element (addToInverse masks it with
 *       0xFFFFFF3F before it is stored),
 *   [1] the continuation CE, or 0 when the element has none,
 *   [2] the single code point, or for longer strings
 *       (size << UCOL_INV_SHIFTVALUE) | offset into stringContinue.
 * assembleInverseTable() copies the rows into the output image. */
131 static uint32_t inverseTable
[0xFFFF][3];
/* NOTE(review): appears to be the index of the last filled row, since
 * addToInverse compares incoming CEs against inverseTable[inversePos];
 * confirm against the code that increments it. */
132 static uint32_t inversePos
= 0;
/* Backing store for entries that map to more than one code unit.
 * addNewInverse/insertInverse copy element->cPoints here, and
 * addToExistingInverse separates alternatives with 0xFFFF and ends the
 * run with 0xFFFE. */
133 static UChar stringContinue
[0xFFFF];
/* Next free index in stringContinue; also the number of UChars that
 * assembleInverseTable() writes out as the continuation block. */
134 static uint32_t sContPos
= 0;
136 static void addNewInverse(UCAElements
*element
, UErrorCode
*status
) {
137 if(U_FAILURE(*status
)) {
140 if(VERBOSE
&& isContinuation(element
->CEs
[1])) {
141 //fprintf(stdout, "+");
144 inverseTable
[inversePos
][0] = element
->CEs
[0];
145 if(element
->noOfCEs
> 1 && isContinuation(element
->CEs
[1])) {
146 inverseTable
[inversePos
][1] = element
->CEs
[1];
148 inverseTable
[inversePos
][1] = 0;
150 if(element
->cSize
< 2) {
151 inverseTable
[inversePos
][2] = element
->cPoints
[0];
152 } else { /* add a new store of cruft */
153 inverseTable
[inversePos
][2] = ((element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | sContPos
;
154 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
155 sContPos
+= element
->cSize
+1;
159 static void insertInverse(UCAElements
*element
, uint32_t position
, UErrorCode
*status
) {
160 if(U_FAILURE(*status
)) {
164 if(VERBOSE
&& isContinuation(element
->CEs
[1])) {
165 //fprintf(stdout, "+");
167 if(position
<= inversePos
) {
168 /*move stuff around */
169 uint32_t amountToMove
= (inversePos
- position
+1)*sizeof(inverseTable
[0]);
170 uprv_memmove(inverseTable
[position
+1], inverseTable
[position
], amountToMove
);
172 inverseTable
[position
][0] = element
->CEs
[0];
173 if(element
->noOfCEs
> 1 && isContinuation(element
->CEs
[1])) {
174 inverseTable
[position
][1] = element
->CEs
[1];
176 inverseTable
[position
][1] = 0;
178 if(element
->cSize
< 2) {
179 inverseTable
[position
][2] = element
->cPoints
[0];
180 } else { /* add a new store of cruft */
181 inverseTable
[position
][2] = ((element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | sContPos
;
182 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
183 sContPos
+= element
->cSize
+1;
188 static void addToExistingInverse(UCAElements
*element
, uint32_t position
, UErrorCode
*status
) {
190 if(U_FAILURE(*status
)) {
194 if((inverseTable
[position
][2] & UCOL_INV_SIZEMASK
) == 0) { /* single element, have to make new extension place and put both guys there */
195 stringContinue
[sContPos
] = (UChar
)inverseTable
[position
][2];
196 inverseTable
[position
][2] = ((element
->cSize
+3) << UCOL_INV_SHIFTVALUE
) | sContPos
;
198 stringContinue
[sContPos
++] = 0xFFFF;
199 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
200 sContPos
+= element
->cSize
;
201 stringContinue
[sContPos
++] = 0xFFFE;
202 } else { /* adding to the already existing continuing table */
203 uint32_t contIndex
= inverseTable
[position
][2] & UCOL_INV_OFFSETMASK
;
204 uint32_t contSize
= (inverseTable
[position
][2] & UCOL_INV_SIZEMASK
) >> UCOL_INV_SHIFTVALUE
;
206 if(contIndex
+contSize
< sContPos
) {
207 /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/
208 memcpy(stringContinue
+contIndex
+contSize
+element
->cSize
+1, stringContinue
+contIndex
+contSize
, (element
->cSize
+1)*sizeof(UChar
));
211 stringContinue
[contIndex
+contSize
-1] = 0xFFFF;
212 memcpy(stringContinue
+contIndex
+contSize
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
213 sContPos
+= element
->cSize
+1;
214 stringContinue
[contIndex
+contSize
+element
->cSize
] = 0xFFFE;
216 inverseTable
[position
][2] = ((contSize
+element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | contIndex
;
220 static uint32_t addToInverse(UCAElements
*element
, UErrorCode
*status
) {
222 uint32_t position
= inversePos
;
223 uint32_t saveElement
= element
->CEs
[0];
224 element
->CEs
[0] &= 0xFFFFFF3F;
225 if(element
->noOfCEs
== 1) {
228 if(inversePos
== 0) {
229 inverseTable
[0][0] = inverseTable
[0][1] = inverseTable
[0][2] = 0;
230 addNewInverse(element
, status
);
231 } else if(inverseTable
[inversePos
][0] > element
->CEs
[0]) {
232 while(inverseTable
[--position
][0] > element
->CEs
[0]) {}
233 if(VERBOSE
) { fprintf(stdout
, "p:%i ", position
); }
234 if(inverseTable
[position
][0] == element
->CEs
[0]) {
235 if(isContinuation(element
->CEs
[1])) {
236 comp
= element
->CEs
[1];
240 if(inverseTable
[position
][1] > comp
) {
241 while(inverseTable
[--position
][1] > comp
) {}
243 if(inverseTable
[position
][1] == comp
) {
244 addToExistingInverse(element
, position
, status
);
246 insertInverse(element
, position
+1, status
);
249 if(VERBOSE
) { fprintf(stdout
, "ins"); }
250 insertInverse(element
, position
+1, status
);
252 } else if(inverseTable
[inversePos
][0] == element
->CEs
[0]) {
253 if(element
->noOfCEs
> 1 && isContinuation(element
->CEs
[1])) {
254 comp
= element
->CEs
[1];
255 if(inverseTable
[position
][1] > comp
) {
256 while(inverseTable
[--position
][1] > comp
) {}
258 if(inverseTable
[position
][1] == comp
) {
259 addToExistingInverse(element
, position
, status
);
261 insertInverse(element
, position
+1, status
);
264 addToExistingInverse(element
, inversePos
, status
);
267 addNewInverse(element
, status
);
269 element
->CEs
[0] = saveElement
;
270 if(VERBOSE
) { fprintf(stdout
, "+"); }
274 static InverseUCATableHeader
*assembleInverseTable(UErrorCode
*status
)
276 InverseUCATableHeader
*result
= NULL
;
277 uint32_t headerByteSize
= paddedsize(sizeof(InverseUCATableHeader
));
278 uint32_t inverseTableByteSize
= (inversePos
+2)*sizeof(uint32_t)*3;
279 uint32_t contsByteSize
= sContPos
* sizeof(UChar
);
282 result
= (InverseUCATableHeader
*)uprv_malloc(headerByteSize
+ inverseTableByteSize
+ contsByteSize
);
284 result
->byteSize
= headerByteSize
+ inverseTableByteSize
+ contsByteSize
;
287 inverseTable
[inversePos
][0] = 0xFFFFFFFF;
288 inverseTable
[inversePos
][1] = 0xFFFFFFFF;
289 inverseTable
[inversePos
][2] = 0x0000FFFF;
292 for(i
= 2; i
<inversePos
; i
++) {
293 if(inverseTable
[i
-1][0] > inverseTable
[i
][0]) {
294 fprintf(stderr
, "Error at %i: %08X & %08X\n", i
, inverseTable
[i
-1][0], inverseTable
[i
][0]);
295 } else if(inverseTable
[i
-1][0] == inverseTable
[i
][0] && !(inverseTable
[i
-1][1] < inverseTable
[i
][1])) {
296 fprintf(stderr
, "Continuation error at %i: %08X %08X & %08X %08X\n", i
, inverseTable
[i
-1][0], inverseTable
[i
-1][1], inverseTable
[i
][0], inverseTable
[i
][1]);
300 result
->tableSize
= inversePos
;
301 result
->contsSize
= sContPos
;
303 result
->table
= headerByteSize
;
304 result
->conts
= headerByteSize
+ inverseTableByteSize
;
306 memcpy((uint8_t *)result
+ result
->table
, inverseTable
, inverseTableByteSize
);
307 memcpy((uint8_t *)result
+ result
->conts
, stringContinue
, contsByteSize
);
310 *status
= U_MEMORY_ALLOCATION_ERROR
;
318 static void writeOutInverseData(InverseUCATableHeader
*data
,
319 const char *outputDir
,
320 const char *copyright
,
323 UNewDataMemory
*pData
;
327 UDataInfo invUcaInfo
;
328 uprv_memcpy(&invUcaInfo
, &invUcaDataInfo
, sizeof(UDataInfo
));
329 u_getUnicodeVersion(invUcaInfo
.dataVersion
);
331 pData
=udata_create(outputDir
, INVC_DATA_TYPE
, U_ICUDATA_NAME
"_" INVC_DATA_NAME
, &invUcaInfo
,
334 if(U_FAILURE(*status
)) {
335 fprintf(stderr
, "Error: unable to create data memory, error %d\n", *status
);
339 /* write the data to the file */
341 fprintf(stdout
, "Writing out inverse UCA table: %s%c%s.%s\n", outputDir
, U_FILE_SEP_CHAR
,
342 U_ICUDATA_NAME
"_" INVC_DATA_NAME
,
345 udata_writeBlock(pData
, data
, data
->byteSize
);
348 dataLength
=udata_finish(pData
, status
);
349 if(U_FAILURE(*status
)) {
350 fprintf(stderr
, "Error: error %d writing the output file\n", *status
);
357 static int32_t hex2num(char hex
) {
358 if(hex
>='0' && hex
<='9') {
360 } else if(hex
>='a' && hex
<='f') {
362 } else if(hex
>='A' && hex
<='F') {
369 UCAElements
*readAnElement(FILE *data
, tempUCATable
*t
, UCAConstants
*consts
, UErrorCode
*status
) {
370 char buffer
[2048], primary
[100], secondary
[100], tertiary
[100];
371 UBool detectedContraction
;
373 unsigned int theValue
;
374 char *pointer
= NULL
;
375 char *commentStart
= NULL
;
376 char *startCodePoint
= NULL
;
377 char *endCodePoint
= NULL
;
378 char *spacePointer
= NULL
;
379 char *result
= fgets(buffer
, 2048, data
);
380 int32_t buflen
= uprv_strlen(buffer
);
381 if(U_FAILURE(*status
)) {
384 *primary
= *secondary
= *tertiary
= '\0';
389 fprintf(stderr
, "empty line but no EOF!\n");
390 *status
= U_INVALID_FORMAT_ERROR
;
394 while(buflen
>0 && (buffer
[buflen
-1] == '\r' || buffer
[buflen
-1] == '\n')) {
395 buffer
[--buflen
] = 0;
398 if(buffer
[0] == 0 || buffer
[0] == '#') {
399 return NULL
; // just a comment, skip whole line
402 UCAElements
*element
= &le
; //(UCAElements *)malloc(sizeof(UCAElements));
411 if(buffer
[0] == '[') {
416 ActionType what_to_do
;
417 } vt
[] = { {"[first tertiary ignorable", consts
->UCA_FIRST_TERTIARY_IGNORABLE
, READCE
},
418 {"[last tertiary ignorable", consts
->UCA_LAST_TERTIARY_IGNORABLE
, READCE
},
419 {"[first secondary ignorable", consts
->UCA_FIRST_SECONDARY_IGNORABLE
, READCE
},
420 {"[last secondary ignorable", consts
->UCA_LAST_SECONDARY_IGNORABLE
, READCE
},
421 {"[first primary ignorable", consts
->UCA_FIRST_PRIMARY_IGNORABLE
, READCE
},
422 {"[last primary ignorable", consts
->UCA_LAST_PRIMARY_IGNORABLE
, READCE
},
423 {"[first variable", consts
->UCA_FIRST_VARIABLE
, READCE
},
424 {"[last variable", consts
->UCA_LAST_VARIABLE
, READCE
},
425 {"[first regular", consts
->UCA_FIRST_NON_VARIABLE
, READCE
},
426 {"[last regular", consts
->UCA_LAST_NON_VARIABLE
, READCE
},
427 {"[first implicit", consts
->UCA_FIRST_IMPLICIT
, READCE
},
428 {"[last implicit", consts
->UCA_LAST_IMPLICIT
, READCE
},
429 {"[first trailing", consts
->UCA_FIRST_TRAILING
, READCE
},
430 {"[last trailing", consts
->UCA_LAST_TRAILING
, READCE
},
432 {"[fixed top", &consts
->UCA_PRIMARY_TOP_MIN
, READHEX
},
433 {"[fixed first implicit byte", &consts
->UCA_PRIMARY_IMPLICIT_MIN
, READHEX
},
434 {"[fixed last implicit byte", &consts
->UCA_PRIMARY_IMPLICIT_MAX
, READHEX
},
435 {"[fixed first trail byte", &consts
->UCA_PRIMARY_TRAILING_MIN
, READHEX
},
436 {"[fixed last trail byte", &consts
->UCA_PRIMARY_TRAILING_MAX
, READHEX
},
437 {"[fixed first special byte", &consts
->UCA_PRIMARY_SPECIAL_MIN
, READHEX
},
438 {"[fixed last special byte", &consts
->UCA_PRIMARY_SPECIAL_MAX
, READHEX
},
439 {"[variable top = ", &t
->options
->variableTopValue
, READHEX
},
440 {"[UCA version = ", NULL
, READUCAVERSION
}
442 for (cnt
= 0; cnt
<sizeof(vt
)/sizeof(vt
[0]); cnt
++) {
443 uint32_t vtLen
= (uint32_t)uprv_strlen(vt
[cnt
].name
);
444 if(uprv_strncmp(buffer
, vt
[cnt
].name
, vtLen
) == 0) {
445 element
->variableTop
= TRUE
;
446 if(vt
[cnt
].what_to_do
== READHEX
) {
447 if(sscanf(buffer
+vtLen
, "%4x", &theValue
) != 1) /* read first code point */
449 fprintf(stderr
, " scanf(hex) failed on !\n ");
451 *(vt
[cnt
].what
) = (UChar
)theValue
;
452 //if(cnt == 1) { // first implicit
453 // we need to set the value for top next
454 //uint32_t nextTop = ucol_prv_calculateImplicitPrimary(0x4E00); // CJK base
455 //consts->UCA_NEXT_TOP_VALUE = theValue<<24 | 0x030303;
457 } else if (vt
[cnt
].what_to_do
== READCE
) { /* vt[cnt].what_to_do == READCE */
458 pointer
= strchr(buffer
+vtLen
, '[');
461 element
->sizePrim
[0]=readElement(&pointer
, primary
, ',', status
);
462 element
->sizeSec
[0]=readElement(&pointer
, secondary
, ',', status
);
463 element
->sizeTer
[0]=readElement(&pointer
, tertiary
, ']', status
);
465 vt
[cnt
].what
[0] = getSingleCEValue(primary
, secondary
, tertiary
, status
);
466 if(element
->sizePrim
[0] > 2 || element
->sizeSec
[0] > 1 || element
->sizeTer
[0] > 1) {
468 uint32_t value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
469 if(2*CEi
<element
->sizePrim
[i
]) {
470 value
|= ((hex2num(*(primary
+4*CEi
))&0xF)<<28);
471 value
|= ((hex2num(*(primary
+4*CEi
+1))&0xF)<<24);
474 if(2*CEi
+1<element
->sizePrim
[i
]) {
475 value
|= ((hex2num(*(primary
+4*CEi
+2))&0xF)<<20);
476 value
|= ((hex2num(*(primary
+4*CEi
+3))&0xF)<<16);
479 if(CEi
<element
->sizeSec
[i
]) {
480 value
|= ((hex2num(*(secondary
+2*CEi
))&0xF)<<12);
481 value
|= ((hex2num(*(secondary
+2*CEi
+1))&0xF)<<8);
484 if(CEi
<element
->sizeTer
[i
]) {
485 value
|= ((hex2num(*(tertiary
+2*CEi
))&0x3)<<4);
486 value
|= (hex2num(*(tertiary
+2*CEi
+1))&0xF);
491 vt
[cnt
].what
[1] = value
;
492 //element->CEs[CEindex++] = value;
497 fprintf(stderr
, "Failed to read a CE from line %s\n", buffer
);
499 } else { //vt[cnt].what_to_do == READUCAVERSION
500 u_versionFromString(UCAVersion
, buffer
+vtLen
);
502 fprintf(stdout
, "UCA version [%hu.%hu.%hu.%hu]\n", UCAVersion
[0], UCAVersion
[1], UCAVersion
[2], UCAVersion
[3]);
505 //element->cPoints[0] = (UChar)theValue;
510 fprintf(stderr
, "Warning: unrecognized option: %s\n", buffer
);
511 //*status = U_INVALID_FORMAT_ERROR;
514 element
->variableTop
= FALSE
;
516 startCodePoint
= buffer
;
517 endCodePoint
= strchr(startCodePoint
, ';');
519 if(endCodePoint
== 0) {
520 fprintf(stderr
, "error - line with no code point!\n");
521 *status
= U_INVALID_FORMAT_ERROR
; /* No code point - could be an error, but probably only an empty line */
527 if(element
!= NULL
) {
528 memset(element
, 0, sizeof(*element
));
530 *status
= U_MEMORY_ALLOCATION_ERROR
;
534 element
->cPoints
= element
->uchars
;
536 spacePointer
= strchr(buffer
, ' ');
537 if(sscanf(buffer
, "%4x", &theValue
) != 1) /* read first code point */
539 fprintf(stderr
, " scanf(hex) failed!\n ");
541 element
->cPoints
[0] = (UChar
)theValue
;
543 if(spacePointer
== 0) {
544 detectedContraction
= FALSE
;
548 detectedContraction
= TRUE
;
549 while(spacePointer
!= NULL
) {
550 sscanf(spacePointer
+1, "%4x", &theValue
);
551 element
->cPoints
[i
++] = (UChar
)theValue
;
552 spacePointer
= strchr(spacePointer
+1, ' ');
557 //fprintf(stderr, "Number of codepoints in contraction: %i\n", i);
560 startCodePoint
= endCodePoint
+1;
562 commentStart
= strchr(startCodePoint
, '#');
563 if(commentStart
== NULL
) {
564 commentStart
= strlen(startCodePoint
) + startCodePoint
;
568 uint32_t CEindex
= 0;
569 element
->noOfCEs
= 0;
571 endCodePoint
= strchr(startCodePoint
, ']');
572 if(endCodePoint
== NULL
|| endCodePoint
>= commentStart
) {
575 pointer
= strchr(startCodePoint
, '[');
578 element
->sizePrim
[i
]=readElement(&pointer
, primary
, ',', status
);
579 element
->sizeSec
[i
]=readElement(&pointer
, secondary
, ',', status
);
580 element
->sizeTer
[i
]=readElement(&pointer
, tertiary
, ']', status
);
583 /* I want to get the CEs entered right here, including continuation */
584 element
->CEs
[CEindex
++] = getSingleCEValue(primary
, secondary
, tertiary
, status
);
587 while(2*CEi
<element
->sizePrim
[i
] || CEi
<element
->sizeSec
[i
] || CEi
<element
->sizeTer
[i
]) {
588 uint32_t value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
589 if(2*CEi
<element
->sizePrim
[i
]) {
590 value
|= ((hex2num(*(primary
+4*CEi
))&0xF)<<28);
591 value
|= ((hex2num(*(primary
+4*CEi
+1))&0xF)<<24);
594 if(2*CEi
+1<element
->sizePrim
[i
]) {
595 value
|= ((hex2num(*(primary
+4*CEi
+2))&0xF)<<20);
596 value
|= ((hex2num(*(primary
+4*CEi
+3))&0xF)<<16);
599 if(CEi
<element
->sizeSec
[i
]) {
600 value
|= ((hex2num(*(secondary
+2*CEi
))&0xF)<<12);
601 value
|= ((hex2num(*(secondary
+2*CEi
+1))&0xF)<<8);
604 if(CEi
<element
->sizeTer
[i
]) {
605 value
|= ((hex2num(*(tertiary
+2*CEi
))&0x3)<<4);
606 value
|= (hex2num(*(tertiary
+2*CEi
+1))&0xF);
611 element
->CEs
[CEindex
++] = value
;
614 startCodePoint
= endCodePoint
+1;
617 element
->noOfCEs
= CEindex
;
619 element
->isThai
= UCOL_ISTHAIPREVOWEL(element
->cPoints
[0]);
621 // we don't want any strange stuff after useful data!
622 while(pointer
< commentStart
) {
623 if(*pointer
!= ' ') {
624 *status
=U_INVALID_FORMAT_ERROR
;
630 if(U_FAILURE(*status
)) {
631 fprintf(stderr
, "problem putting stuff in hash table\n");
632 *status
= U_INTERNAL_PROGRAM_ERROR
;
640 void writeOutData(UCATableHeader
*data
,
641 UCAConstants
*consts
,
642 UChar contractions
[][3],
643 uint32_t noOfcontractions
,
644 const char *outputDir
,
645 const char *copyright
,
648 if(U_FAILURE(*status
)) {
652 uint32_t size
= data
->size
;
654 if(noOfcontractions
!= 0) {
655 contractions
[noOfcontractions
][0] = 0;
656 contractions
[noOfcontractions
][1] = 0;
657 contractions
[noOfcontractions
][2] = 0;
661 data
->UCAConsts
= data
->size
;
662 data
->size
+= paddedsize(sizeof(UCAConstants
));
663 data
->contractionUCACombos
= data
->size
;
664 data
->size
+= paddedsize((noOfcontractions
*3*sizeof(UChar
)));
667 UNewDataMemory
*pData
;
671 uprv_memcpy(&ucaInfo
, &ucaDataInfo
, sizeof(UDataInfo
));
672 u_getUnicodeVersion(ucaInfo
.dataVersion
);
674 pData
=udata_create(outputDir
, UCA_DATA_TYPE
, U_ICUDATA_NAME
"_" UCA_DATA_NAME
, &ucaInfo
,
677 if(U_FAILURE(*status
)) {
678 fprintf(stderr
, "Error: unable to create data memory, error %d\n", *status
);
682 /* write the data to the file */
684 fprintf(stdout
, "Writing out UCA table: %s%c%s.%s\n", outputDir
,
686 U_ICUDATA_NAME
"_" UCA_DATA_NAME
,
689 udata_writeBlock(pData
, data
, size
);
691 // output the constants here
692 udata_writeBlock(pData
, consts
, sizeof(UCAConstants
));
694 if(noOfcontractions
!= 0) {
695 udata_writeBlock(pData
, contractions
, noOfcontractions
*3*sizeof(UChar
));
696 udata_writePadding(pData
, paddedsize((noOfcontractions
*3*sizeof(UChar
))) - noOfcontractions
*3*sizeof(uint16_t));
700 dataLength
=udata_finish(pData
, status
);
701 if(U_FAILURE(*status
)) {
702 fprintf(stderr
, "Error: error %d writing the output file\n", *status
);
708 write_uca_table(const char *filename
,
709 const char *outputDir
,
710 const char *copyright
,
713 FILE *data
= fopen(filename
, "r");
715 UCAElements
*element
= NULL
;
716 UChar variableTopValue
= 0;
717 UCATableHeader
*myD
= (UCATableHeader
*)uprv_malloc(sizeof(UCATableHeader
));
720 *status
= U_MEMORY_ALLOCATION_ERROR
;
724 UColOptionSet
*opts
= (UColOptionSet
*)uprv_malloc(sizeof(UColOptionSet
));
727 *status
= U_MEMORY_ALLOCATION_ERROR
;
732 UChar contractionCEs
[256][3];
733 uint32_t noOfContractions
= 0;
736 UCAConstants consts
= {
737 UCOL_RESET_TOP_VALUE
,
738 UCOL_FIRST_PRIMARY_IGNORABLE
,
739 UCOL_LAST_PRIMARY_IGNORABLE
,
740 UCOL_LAST_PRIMARY_IGNORABLE_CONT
,
741 UCOL_FIRST_SECONDARY_IGNORABLE
,
742 UCOL_LAST_SECONDARY_IGNORABLE
,
743 UCOL_FIRST_TERTIARY_IGNORABLE
,
744 UCOL_LAST_TERTIARY_IGNORABLE
,
747 UCOL_FIRST_NON_VARIABLE
,
748 UCOL_LAST_NON_VARIABLE
,
752 UCOL_NEXT_FIRST_PRIMARY_IGNORABLE,
753 UCOL_NEXT_LAST_PRIMARY_IGNORABLE,
754 UCOL_NEXT_FIRST_SECONDARY_IGNORABLE,
755 UCOL_NEXT_LAST_SECONDARY_IGNORABLE,
756 UCOL_NEXT_FIRST_TERTIARY_IGNORABLE,
757 UCOL_NEXT_LAST_TERTIARY_IGNORABLE,
758 UCOL_NEXT_FIRST_VARIABLE,
759 UCOL_NEXT_LAST_VARIABLE,
762 PRIMARY_IMPLICIT_MIN
,
769 fprintf(stderr
, "Couldn't open file: %s\n", filename
);
773 memset(inverseTable
, 0xDA, sizeof(int32_t)*3*0xFFFF);
775 opts
->variableTopValue
= variableTopValue
;
776 opts
->strength
= UCOL_TERTIARY
;
777 opts
->frenchCollation
= UCOL_OFF
;
778 opts
->alternateHandling
= UCOL_NON_IGNORABLE
; /* attribute for handling variable elements*/
779 opts
->caseFirst
= UCOL_OFF
; /* who goes first, lower case or uppercase */
780 opts
->caseLevel
= UCOL_OFF
; /* do we have an extra case level */
781 opts
->normalizationMode
= UCOL_OFF
; /* attribute for normalization */
782 opts
->hiraganaQ
= UCOL_OFF
; /* attribute for JIS X 4061, used only in Japanese */
783 myD
->jamoSpecial
= FALSE
;
785 tempUCATable
*t
= uprv_uca_initTempTable(myD
, opts
, NULL
, IMPLICIT_TAG
, status
);
786 if(U_FAILURE(*status
))
788 fprintf(stderr
, "Failed to init UCA temp table: %s\n", u_errorName(*status
));
795 *****************************************************************************************
796 * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF
797 ******************************************************************************************
809 {0xAC00, 0xD7AF, UCOL_SPECIAL_FLAG
| (HANGUL_SYLLABLE_TAG
<< 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
810 {0xD800, 0xDBFF, UCOL_SPECIAL_FLAG
| (LEAD_SURROGATE_TAG
<< 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/
811 {0xDC00, 0xDFFF, UCOL_SPECIAL_FLAG
| (TRAIL_SURROGATE_TAG
<< 24) }, //2 TRAIL_SURROGATE DC00-DFFF
812 {0x3400, 0x4DB5, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
813 {0x4E00, 0x9FA5, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
814 {0xF900, 0xFA2D, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
815 {0x20000, 0x2A6D6, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
816 {0x2F800, 0x2FA1D, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
818 {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG
| (HANGUL_SYLLABLE_TAG
<< 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
819 {0xD800, 0xDC00, UCOL_SPECIAL_FLAG
| (LEAD_SURROGATE_TAG
<< 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/
820 {0xDC00, 0xE000, UCOL_SPECIAL_FLAG
| (TRAIL_SURROGATE_TAG
<< 24) }, //2 TRAIL_SURROGATE DC00-DFFF
821 // Now directly handled in the collation code by the swapCJK function.
822 //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
823 //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
824 //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
825 //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
826 //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
830 for(i
= 0; i
<sizeof(ranges
)/sizeof(ranges
[0]); i
++) {
831 /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */
832 utrie_setRange32(t
->mapping
, ranges
[i
].start
, ranges
[i
].end
, ranges
[i
].value
, TRUE
);
836 int32_t surrogateCount
= 0;
838 if(U_FAILURE(*status
)) {
839 fprintf(stderr
, "Something returned an error %i (%s) while processing line %i of %s. Exiting...\n",
840 *status
, u_errorName(*status
), line
, filename
);
844 element
= readAnElement(data
, t
, &consts
, status
);
847 fprintf(stdout
, "%i ", line
);
849 if(element
!= NULL
) {
850 // we have read the line, now do something sensible with the read data!
852 // Below stuff was taken care of in readAnElement
853 //if(element->variableTop == TRUE && variableTopValue == 0) {
854 // t->options->variableTopValue = element->cPoints[0];
857 // if element is a contraction, we want to add it to contractions
858 if(element
->cSize
> 1 && element
->cPoints
[0] != 0xFDD0) { // this is a contraction
859 if(UTF_IS_LEAD(element
->cPoints
[0]) && UTF_IS_TRAIL(element
->cPoints
[1]) && element
->cSize
== 2) {
862 contractionCEs
[noOfContractions
][0] = element
->cPoints
[0];
863 contractionCEs
[noOfContractions
][1] = element
->cPoints
[1];
864 if(element
->cSize
> 2) { // the third one
865 contractionCEs
[noOfContractions
][2] = element
->cPoints
[2];
867 contractionCEs
[noOfContractions
][2] = 0;
873 /* we're first adding to inverse, because addAnElement will reverse the order */
874 /* of code points and stuff... we don't want that to happen */
875 addToInverse(element
, status
);
876 if(!(element
->cSize
> 1 && element
->cPoints
[0] == 0xFDD0)) {
877 uprv_uca_addAnElement(t
, element
, status
);
882 if(UCAVersion
[0] == 0 && UCAVersion
[1] == 0 && UCAVersion
[2] == 0 && UCAVersion
[3] == 0) {
883 fprintf(stderr
, "UCA version not specified. Cannot create data file!\n");
889 fprintf(stdout
, "\nLines read: %i\n", line
);
890 fprintf(stdout
, "Surrogate count: %i\n", surrogateCount
);
891 fprintf(stdout
, "Raw data breakdown:\n");
892 /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
893 fprintf(stdout
, "Number of contractions: %i\n", noOfContractions
);
894 fprintf(stdout
, "Contraction image size: %i\n", t
->image
->contractionSize
);
895 fprintf(stdout
, "Expansions size: %i\n", t
->expansions
->position
);
899 /* produce canonical closure for table */
900 /* first set up constants for implicit calculation */
901 uprv_uca_initImplicitConstants(consts
.UCA_PRIMARY_IMPLICIT_MIN
);
903 int32_t noOfClosures
= uprv_uca_canonicalClosure(t
, status
);
904 if(noOfClosures
!= 0) {
905 fprintf(stderr
, "Warning: %i canonical closures occured!\n", noOfClosures
);
909 UCATableHeader
*myData
= uprv_uca_assembleTable(t
, status
);
912 fprintf(stdout
, "Compacted data breakdown:\n");
913 /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
914 fprintf(stdout
, "Number of contractions: %i\n", noOfContractions
);
915 fprintf(stdout
, "Contraction image size: %i\n", t
->image
->contractionSize
);
916 fprintf(stdout
, "Expansions size: %i\n", t
->expansions
->position
);
919 /* populate the version info struct with version info*/
920 myData
->version
[0] = UCOL_BUILDER_VERSION
;
921 myData
->version
[1] = UCAVersion
[0];
922 myData
->version
[2] = UCAVersion
[1];
923 myData
->version
[3] = UCAVersion
[2];
924 /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
925 // Removed this macro. Instead, we use the fields below
926 //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION;
927 //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt
928 uprv_memcpy(myData
->UCAVersion
, UCAVersion
, sizeof(UVersionInfo
));
929 u_getUnicodeVersion(myData
->UCDVersion
);
931 writeOutData(myData
, &consts
, contractionCEs
, noOfContractions
, outputDir
, copyright
, status
);
933 InverseUCATableHeader
*inverse
= assembleInverseTable(status
);
934 uprv_memcpy(inverse
->UCAVersion
, UCAVersion
, sizeof(UVersionInfo
));
935 writeOutInverseData(inverse
, outputDir
, copyright
, status
);
937 uprv_uca_closeTempTable(t
);
949 #endif /* #if !UCONFIG_NO_COLLATION */
951 static UOption options
[]={
952 UOPTION_HELP_H
, /* 0 Numbers for those who*/
953 UOPTION_HELP_QUESTION_MARK
, /* 1 can't count. */
954 UOPTION_COPYRIGHT
, /* 2 */
955 UOPTION_VERSION
, /* 3 */
956 UOPTION_DESTDIR
, /* 4 */
957 UOPTION_SOURCEDIR
, /* 5 */
958 UOPTION_VERBOSE
, /* 6 */
959 UOPTION_ICUDATADIR
/* 7 */
960 /* weiv can't count :))))) */
963 int main(int argc
, char* argv
[]) {
964 UErrorCode status
= U_ZERO_ERROR
;
965 const char* destdir
= NULL
;
966 const char* srcDir
= NULL
;
968 char *basename
= NULL
;
969 const char *copyright
= NULL
;
970 uprv_memset(&UCAVersion
, 0, 4);
972 U_MAIN_INIT_ARGS(argc
, argv
);
974 /* preset then read command line options */
975 options
[4].value
=u_getDataDirectory();
977 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
979 /* error handling, printing usage message */
982 "error in command line argument \"%s\"\n",
987 if(options
[0].doesOccur
|| options
[1].doesOccur
) {
989 "usage: %s [-options] file\n"
990 "\tRead in UCA collation text data and write out the binary collation data\n"
992 "\t-h or -? or --help this usage text\n"
993 "\t-V or --version show a version message\n"
994 "\t-c or --copyright include a copyright notice\n"
995 "\t-d or --destdir destination directory, followed by the path\n"
996 "\t-s or --sourcedir source directory, followed by the path\n"
997 "\t-v or --verbose turn on verbose output\n"
998 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
999 "\t followed by path, defaults to %s\n",
1000 argv
[0], u_getDataDirectory());
1001 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
1004 if(options
[3].doesOccur
) {
1005 fprintf(stdout
, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n",
1006 #if UCONFIG_NO_COLLATION
1009 ucaDataInfo
.formatVersion
[0], ucaDataInfo
.formatVersion
[1]
1012 fprintf(stdout
, "Copyright (C) 2000-2001, International Business Machines\n");
1013 fprintf(stdout
, "Corporation and others. All Rights Reserved.\n");
1017 /* get the options values */
1018 destdir
= options
[4].value
;
1019 srcDir
= options
[5].value
;
1020 VERBOSE
= options
[6].doesOccur
;
1022 if (options
[2].doesOccur
) {
1023 copyright
= U_COPYRIGHT_STRING
;
1026 if (options
[7].doesOccur
) {
1027 u_setDataDirectory(options
[7].value
);
1030 /* prepare the filename beginning with the source dir */
1031 uprv_strcpy(filename
, srcDir
);
1032 basename
=filename
+uprv_strlen(filename
);
1034 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
1035 *basename
++ = U_FILE_SEP_CHAR
;
1039 uprv_strcpy(basename
, "FractionalUCA.txt");
1042 uprv_strcpy(basename
, getLongPathname(*argv
));
1046 if(u_getCombiningClass(0x0053) == 0)
1048 fprintf(stderr
, "SEVERE ERROR: Normalization data is not functioning! Bailing out. Was not able to load unorm.dat.\n");
1053 #if UCONFIG_NO_COLLATION
1055 UNewDataMemory
*pData
;
1058 msg
= "genuca writes dummy " U_ICUDATA_NAME
"_" UCA_DATA_NAME
"." UCA_DATA_TYPE
" because of UCONFIG_NO_COLLATION, see uconfig.h";
1059 fprintf(stderr
, "%s\n", msg
);
1060 pData
= udata_create(destdir
, UCA_DATA_TYPE
, U_ICUDATA_NAME
"_" UCA_DATA_NAME
, &dummyDataInfo
,
1062 udata_writeBlock(pData
, msg
, strlen(msg
));
1063 udata_finish(pData
, &status
);
1065 msg
= "genuca writes dummy " U_ICUDATA_NAME
"_" INVC_DATA_NAME
"." INVC_DATA_TYPE
" because of UCONFIG_NO_COLLATION, see uconfig.h";
1066 fprintf(stderr
, "%s\n", msg
);
1067 pData
= udata_create(destdir
, INVC_DATA_TYPE
, U_ICUDATA_NAME
"_" INVC_DATA_NAME
, &dummyDataInfo
,
1069 udata_writeBlock(pData
, msg
, strlen(msg
));
1070 udata_finish(pData
, &status
);
1076 return write_uca_table(filename
, destdir
, copyright
, &status
);
1082 * Hey, Emacs, please set the following:
1085 * indent-tabs-mode: nil