2 *******************************************************************************
4 * Copyright (C) 2000-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: genuca.cpp
10 * tab size: 8 (not used)
13 * created at the end of XX century
14 * created by: Vladimir Weinstein
16 * This program reads the Fractional UCA table and generates
17 * internal format for UCA table as well as inverse UCA table.
18 * It then writes binary files containing the data: ucadata.dat
21 * 02/23/2001 grhoten Made it into a tool
22 * 02/23/2001 weiv Moved element & table handling code to i18n
23 * 05/09/2001 weiv Case bits are now in the CEs, not in front
26 #include "unicode/utypes.h"
27 #include "unicode/putil.h"
28 #include "unicode/udata.h"
29 #include "unicode/uclean.h"
43 UBool VERBOSE
= FALSE
;
45 static UVersionInfo UCAVersion
;
47 #if UCONFIG_NO_COLLATION
49 /* dummy UDataInfo cf. udata.h */
50 static UDataInfo dummyDataInfo
= {
59 { 0, 0, 0, 0 }, /* dummy dataFormat */
60 { 0, 0, 0, 0 }, /* dummy formatVersion */
61 { 0, 0, 0, 0 } /* dummy dataVersion */
66 static const UDataInfo ucaDataInfo
={
75 {UCA_DATA_FORMAT_0
, UCA_DATA_FORMAT_1
, UCA_DATA_FORMAT_2
, UCA_DATA_FORMAT_3
}, /* dataFormat="UCol" */
76 /* 03/26/2002 bumped up version since format has changed */
77 /* 09/16/2002 bumped up version since we went from UColAttributeValue */
78 /* to int32_t in UColOptionSet */
79 /* 05/13/2003 This one also updated since we added UCA and UCD versions */
81 /* 09/11/2003 Adding information required by data swapper */
82 {UCA_FORMAT_VERSION_0
, UCA_FORMAT_VERSION_1
, UCA_FORMAT_VERSION_2
, UCA_FORMAT_VERSION_3
}, /* formatVersion */
83 {0, 0, 0, 0} /* dataVersion = Unicode Version*/
86 static const UDataInfo invUcaDataInfo
={
95 {INVUCA_DATA_FORMAT_0
, INVUCA_DATA_FORMAT_1
, INVUCA_DATA_FORMAT_2
, INVUCA_DATA_FORMAT_3
}, /* dataFormat="InvC" */
96 /* 03/26/2002 bumped up version since format has changed */
97 /* 04/29/2003 2.1 format - we have added UCA version to header */
98 {INVUCA_FORMAT_VERSION_0
, INVUCA_FORMAT_VERSION_1
, INVUCA_FORMAT_VERSION_2
, INVUCA_FORMAT_VERSION_3
}, /* formatVersion */
99 {0, 0, 0, 0} /* dataVersion = Unicode Version*/
104 int32_t readElement(char **from
, char *to
, char separator
, UErrorCode
*status
) {
105 if(U_FAILURE(*status
)) {
110 while(**from
!= separator
) {
112 *(buffer
+i
++) = **from
;
118 //*to = (char *)malloc(strlen(buffer)+1);
124 uint32_t getSingleCEValue(char *primary
, char *secondary
, char *tertiary
, UErrorCode
*status
) {
125 if(U_FAILURE(*status
)) {
129 char primsave
= '\0';
132 char *primend
= primary
+4;
133 if(strlen(primary
) > 4) {
137 char *secend
= secondary
+2;
138 if(strlen(secondary
) > 2) {
142 char *terend
= tertiary
+2;
143 if(strlen(tertiary
) > 2) {
147 uint32_t primvalue
= (uint32_t)((*primary
!='\0')?strtoul(primary
, &primend
, 16):0);
148 uint32_t secvalue
= (uint32_t)((*secondary
!='\0')?strtoul(secondary
, &secend
, 16):0);
149 uint32_t tervalue
= (uint32_t)((*tertiary
!='\0')?strtoul(tertiary
, &terend
, 16):0);
150 if(primvalue
<= 0xFF) {
154 value
= ((primvalue
<<UCOL_PRIMARYORDERSHIFT
)&UCOL_PRIMARYORDERMASK
)|
155 ((secvalue
<<UCOL_SECONDARYORDERSHIFT
)&UCOL_SECONDARYORDERMASK
)|
156 (tervalue
&UCOL_TERTIARYORDERMASK
);
170 static uint32_t inverseTable
[0xFFFF][3];
171 static uint32_t inversePos
= 0;
172 static UChar stringContinue
[0xFFFF];
173 static uint32_t sContPos
= 0;
175 static void addNewInverse(UCAElements
*element
, UErrorCode
*status
) {
176 if(U_FAILURE(*status
)) {
179 if(VERBOSE
&& isContinuation(element
->CEs
[1])) {
180 //fprintf(stdout, "+");
183 inverseTable
[inversePos
][0] = element
->CEs
[0];
184 if(element
->noOfCEs
> 1 && isContinuation(element
->CEs
[1])) {
185 inverseTable
[inversePos
][1] = element
->CEs
[1];
187 inverseTable
[inversePos
][1] = 0;
189 if(element
->cSize
< 2) {
190 inverseTable
[inversePos
][2] = element
->cPoints
[0];
191 } else { /* add a new store of cruft */
192 inverseTable
[inversePos
][2] = ((element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | sContPos
;
193 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
194 sContPos
+= element
->cSize
+1;
198 static void insertInverse(UCAElements
*element
, uint32_t position
, UErrorCode
*status
) {
199 if(U_FAILURE(*status
)) {
203 if(VERBOSE
&& isContinuation(element
->CEs
[1])) {
204 //fprintf(stdout, "+");
206 if(position
<= inversePos
) {
207 /*move stuff around */
208 uint32_t amountToMove
= (inversePos
- position
+1)*sizeof(inverseTable
[0]);
209 uprv_memmove(inverseTable
[position
+1], inverseTable
[position
], amountToMove
);
211 inverseTable
[position
][0] = element
->CEs
[0];
212 if(element
->noOfCEs
> 1 && isContinuation(element
->CEs
[1])) {
213 inverseTable
[position
][1] = element
->CEs
[1];
215 inverseTable
[position
][1] = 0;
217 if(element
->cSize
< 2) {
218 inverseTable
[position
][2] = element
->cPoints
[0];
219 } else { /* add a new store of cruft */
220 inverseTable
[position
][2] = ((element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | sContPos
;
221 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
222 sContPos
+= element
->cSize
+1;
227 static void addToExistingInverse(UCAElements
*element
, uint32_t position
, UErrorCode
*status
) {
229 if(U_FAILURE(*status
)) {
233 if((inverseTable
[position
][2] & UCOL_INV_SIZEMASK
) == 0) { /* single element, have to make new extension place and put both guys there */
234 stringContinue
[sContPos
] = (UChar
)inverseTable
[position
][2];
235 inverseTable
[position
][2] = ((element
->cSize
+3) << UCOL_INV_SHIFTVALUE
) | sContPos
;
237 stringContinue
[sContPos
++] = 0xFFFF;
238 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
239 sContPos
+= element
->cSize
;
240 stringContinue
[sContPos
++] = 0xFFFE;
241 } else { /* adding to the already existing continuing table */
242 uint32_t contIndex
= inverseTable
[position
][2] & UCOL_INV_OFFSETMASK
;
243 uint32_t contSize
= (inverseTable
[position
][2] & UCOL_INV_SIZEMASK
) >> UCOL_INV_SHIFTVALUE
;
245 if(contIndex
+contSize
< sContPos
) {
246 /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/
247 memcpy(stringContinue
+contIndex
+contSize
+element
->cSize
+1, stringContinue
+contIndex
+contSize
, (element
->cSize
+1)*sizeof(UChar
));
250 stringContinue
[contIndex
+contSize
-1] = 0xFFFF;
251 memcpy(stringContinue
+contIndex
+contSize
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
252 sContPos
+= element
->cSize
+1;
253 stringContinue
[contIndex
+contSize
+element
->cSize
] = 0xFFFE;
255 inverseTable
[position
][2] = ((contSize
+element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | contIndex
;
260 * Takes two CEs (lead and continuation) and
261 * compares them as CEs should be compared:
262 * primary vs. primary, secondary vs. secondary
263 * tertiary vs. tertiary
265 static int32_t compareCEs(uint32_t *source
, uint32_t *target
) {
266 uint32_t s1
= source
[0], s2
, t1
= target
[0], t2
;
267 if(isContinuation(source
[1])) {
272 if(isContinuation(target
[1])) {
278 uint32_t s
= 0, t
= 0;
279 if(s1
== t1
&& s2
== t2
) {
282 s
= (s1
& 0xFFFF0000)|((s2
& 0xFFFF0000)>>16);
283 t
= (t1
& 0xFFFF0000)|((t2
& 0xFFFF0000)>>16);
289 s
= (s1
& 0x0000FF00) | (s2
& 0x0000FF00)>>8;
290 t
= (t1
& 0x0000FF00) | (t2
& 0x0000FF00)>>8;
296 s
= (s1
& 0x000000FF)<<8 | (s2
& 0x000000FF);
297 t
= (t1
& 0x000000FF)<<8 | (t2
& 0x000000FF);
307 static uint32_t addToInverse(UCAElements
*element
, UErrorCode
*status
) {
308 uint32_t position
= inversePos
;
309 uint32_t saveElement
= element
->CEs
[0];
310 int32_t compResult
= 0;
311 element
->CEs
[0] &= 0xFFFFFF3F;
312 if(element
->noOfCEs
== 1) {
315 if(inversePos
== 0) {
316 inverseTable
[0][0] = inverseTable
[0][1] = inverseTable
[0][2] = 0;
317 addNewInverse(element
, status
);
318 } else if(compareCEs(inverseTable
[inversePos
], element
->CEs
) > 0) {
319 while((compResult
= compareCEs(inverseTable
[--position
], element
->CEs
)) > 0);
320 if(VERBOSE
) { fprintf(stdout
, "p:%u ", (int)position
); }
321 if(compResult
== 0) {
322 addToExistingInverse(element
, position
, status
);
324 insertInverse(element
, position
+1, status
);
326 } else if(compareCEs(inverseTable
[inversePos
], element
->CEs
) == 0) {
327 addToExistingInverse(element
, inversePos
, status
);
329 addNewInverse(element
, status
);
331 element
->CEs
[0] = saveElement
;
332 if(VERBOSE
) { fprintf(stdout
, "+"); }
336 static InverseUCATableHeader
*assembleInverseTable(UErrorCode
*status
)
338 InverseUCATableHeader
*result
= NULL
;
339 uint32_t headerByteSize
= paddedsize(sizeof(InverseUCATableHeader
));
340 uint32_t inverseTableByteSize
= (inversePos
+2)*sizeof(uint32_t)*3;
341 uint32_t contsByteSize
= sContPos
* sizeof(UChar
);
344 result
= (InverseUCATableHeader
*)uprv_malloc(headerByteSize
+ inverseTableByteSize
+ contsByteSize
);
345 uprv_memset(result
, 0, headerByteSize
+ inverseTableByteSize
+ contsByteSize
);
347 result
->byteSize
= headerByteSize
+ inverseTableByteSize
+ contsByteSize
;
350 inverseTable
[inversePos
][0] = 0xFFFFFFFF;
351 inverseTable
[inversePos
][1] = 0xFFFFFFFF;
352 inverseTable
[inversePos
][2] = 0x0000FFFF;
355 for(i
= 2; i
<inversePos
; i
++) {
356 if(compareCEs(inverseTable
[i
-1], inverseTable
[i
]) > 0) {
357 fprintf(stderr
, "Error at %i: %08X & %08X\n", (int)i
, (int)inverseTable
[i
-1][0], (int)inverseTable
[i
][0]);
358 } else if(inverseTable
[i
-1][0] == inverseTable
[i
][0] && !(inverseTable
[i
-1][1] < inverseTable
[i
][1])) {
359 fprintf(stderr
, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i
, (int)inverseTable
[i
-1][0], (int)inverseTable
[i
-1][1], (int)inverseTable
[i
][0], (int)inverseTable
[i
][1]);
363 result
->tableSize
= inversePos
;
364 result
->contsSize
= sContPos
;
366 result
->table
= headerByteSize
;
367 result
->conts
= headerByteSize
+ inverseTableByteSize
;
369 memcpy((uint8_t *)result
+ result
->table
, inverseTable
, inverseTableByteSize
);
370 memcpy((uint8_t *)result
+ result
->conts
, stringContinue
, contsByteSize
);
373 *status
= U_MEMORY_ALLOCATION_ERROR
;
381 static void writeOutInverseData(InverseUCATableHeader
*data
,
382 const char *outputDir
,
383 const char *copyright
,
386 UNewDataMemory
*pData
;
390 UDataInfo invUcaInfo
;
391 uprv_memcpy(&invUcaInfo
, &invUcaDataInfo
, sizeof(UDataInfo
));
392 u_getUnicodeVersion(invUcaInfo
.dataVersion
);
394 pData
=udata_create(outputDir
, INVC_DATA_TYPE
, INVC_DATA_NAME
, &invUcaInfo
,
397 if(U_FAILURE(*status
)) {
398 fprintf(stderr
, "Error: unable to create %s"INVC_DATA_NAME
", error %s\n", outputDir
, u_errorName(*status
));
402 /* write the data to the file */
404 fprintf(stdout
, "Writing out inverse UCA table: %s%c%s.%s\n", outputDir
, U_FILE_SEP_CHAR
,
408 udata_writeBlock(pData
, data
, data
->byteSize
);
411 dataLength
=udata_finish(pData
, status
);
412 if(U_FAILURE(*status
)) {
413 fprintf(stderr
, "Error: error %d writing the output file\n", *status
);
420 static int32_t hex2num(char hex
) {
421 if(hex
>='0' && hex
<='9') {
423 } else if(hex
>='a' && hex
<='f') {
425 } else if(hex
>='A' && hex
<='F') {
432 UCAElements
*readAnElement(FILE *data
, tempUCATable
*t
, UCAConstants
*consts
, UErrorCode
*status
) {
433 char buffer
[2048], primary
[100], secondary
[100], tertiary
[100];
434 UBool detectedContraction
;
436 unsigned int theValue
;
437 char *pointer
= NULL
;
438 char *commentStart
= NULL
;
439 char *startCodePoint
= NULL
;
440 char *endCodePoint
= NULL
;
441 char *spacePointer
= NULL
;
442 char *dashPointer
= NULL
;
443 char *result
= fgets(buffer
, 2048, data
);
444 int32_t buflen
= (int32_t)uprv_strlen(buffer
);
445 if(U_FAILURE(*status
)) {
448 *primary
= *secondary
= *tertiary
= '\0';
453 fprintf(stderr
, "empty line but no EOF!\n");
454 *status
= U_INVALID_FORMAT_ERROR
;
458 while(buflen
>0 && (buffer
[buflen
-1] == '\r' || buffer
[buflen
-1] == '\n')) {
459 buffer
[--buflen
] = 0;
462 if(buffer
[0] == 0 || buffer
[0] == '#') {
463 return NULL
; // just a comment, skip whole line
466 UCAElements
*element
= &le
; //(UCAElements *)malloc(sizeof(UCAElements));
475 if(buffer
[0] == '[') {
477 static const struct {
480 ActionType what_to_do
;
481 } vt
[] = { {"[first tertiary ignorable", consts
->UCA_FIRST_TERTIARY_IGNORABLE
, READCE
},
482 {"[last tertiary ignorable", consts
->UCA_LAST_TERTIARY_IGNORABLE
, READCE
},
483 {"[first secondary ignorable", consts
->UCA_FIRST_SECONDARY_IGNORABLE
, READCE
},
484 {"[last secondary ignorable", consts
->UCA_LAST_SECONDARY_IGNORABLE
, READCE
},
485 {"[first primary ignorable", consts
->UCA_FIRST_PRIMARY_IGNORABLE
, READCE
},
486 {"[last primary ignorable", consts
->UCA_LAST_PRIMARY_IGNORABLE
, READCE
},
487 {"[first variable", consts
->UCA_FIRST_VARIABLE
, READCE
},
488 {"[last variable", consts
->UCA_LAST_VARIABLE
, READCE
},
489 {"[first regular", consts
->UCA_FIRST_NON_VARIABLE
, READCE
},
490 {"[last regular", consts
->UCA_LAST_NON_VARIABLE
, READCE
},
491 {"[first implicit", consts
->UCA_FIRST_IMPLICIT
, READCE
},
492 {"[last implicit", consts
->UCA_LAST_IMPLICIT
, READCE
},
493 {"[first trailing", consts
->UCA_FIRST_TRAILING
, READCE
},
494 {"[last trailing", consts
->UCA_LAST_TRAILING
, READCE
},
496 {"[fixed top", &consts
->UCA_PRIMARY_TOP_MIN
, READHEX
},
497 {"[fixed first implicit byte", &consts
->UCA_PRIMARY_IMPLICIT_MIN
, READHEX
},
498 {"[fixed last implicit byte", &consts
->UCA_PRIMARY_IMPLICIT_MAX
, READHEX
},
499 {"[fixed first trail byte", &consts
->UCA_PRIMARY_TRAILING_MIN
, READHEX
},
500 {"[fixed last trail byte", &consts
->UCA_PRIMARY_TRAILING_MAX
, READHEX
},
501 {"[fixed first special byte", &consts
->UCA_PRIMARY_SPECIAL_MIN
, READHEX
},
502 {"[fixed last special byte", &consts
->UCA_PRIMARY_SPECIAL_MAX
, READHEX
},
503 {"[variable top = ", &t
->options
->variableTopValue
, READHEX
},
504 {"[UCA version = ", NULL
, READUCAVERSION
}
506 for (cnt
= 0; cnt
<sizeof(vt
)/sizeof(vt
[0]); cnt
++) {
507 uint32_t vtLen
= (uint32_t)uprv_strlen(vt
[cnt
].name
);
508 if(uprv_strncmp(buffer
, vt
[cnt
].name
, vtLen
) == 0) {
509 element
->variableTop
= TRUE
;
510 if(vt
[cnt
].what_to_do
== READHEX
) {
511 if(sscanf(buffer
+vtLen
, "%4x", &theValue
) != 1) /* read first code point */
513 fprintf(stderr
, " scanf(hex) failed on !\n ");
515 *(vt
[cnt
].what
) = (UChar
)theValue
;
516 //if(cnt == 1) { // first implicit
517 // we need to set the value for top next
518 //uint32_t nextTop = ucol_prv_calculateImplicitPrimary(0x4E00); // CJK base
519 //consts->UCA_NEXT_TOP_VALUE = theValue<<24 | 0x030303;
521 } else if (vt
[cnt
].what_to_do
== READCE
) { /* vt[cnt].what_to_do == READCE */
522 pointer
= strchr(buffer
+vtLen
, '[');
525 element
->sizePrim
[0]=readElement(&pointer
, primary
, ',', status
);
526 element
->sizeSec
[0]=readElement(&pointer
, secondary
, ',', status
);
527 element
->sizeTer
[0]=readElement(&pointer
, tertiary
, ']', status
);
529 vt
[cnt
].what
[0] = getSingleCEValue(primary
, secondary
, tertiary
, status
);
530 if(element
->sizePrim
[0] > 2 || element
->sizeSec
[0] > 1 || element
->sizeTer
[0] > 1) {
532 uint32_t value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
533 if(2*CEi
<element
->sizePrim
[i
]) {
534 value
|= ((hex2num(*(primary
+4*CEi
))&0xF)<<28);
535 value
|= ((hex2num(*(primary
+4*CEi
+1))&0xF)<<24);
538 if(2*CEi
+1<element
->sizePrim
[i
]) {
539 value
|= ((hex2num(*(primary
+4*CEi
+2))&0xF)<<20);
540 value
|= ((hex2num(*(primary
+4*CEi
+3))&0xF)<<16);
543 if(CEi
<element
->sizeSec
[i
]) {
544 value
|= ((hex2num(*(secondary
+2*CEi
))&0xF)<<12);
545 value
|= ((hex2num(*(secondary
+2*CEi
+1))&0xF)<<8);
548 if(CEi
<element
->sizeTer
[i
]) {
549 value
|= ((hex2num(*(tertiary
+2*CEi
))&0x3)<<4);
550 value
|= (hex2num(*(tertiary
+2*CEi
+1))&0xF);
555 vt
[cnt
].what
[1] = value
;
556 //element->CEs[CEindex++] = value;
561 fprintf(stderr
, "Failed to read a CE from line %s\n", buffer
);
563 } else { //vt[cnt].what_to_do == READUCAVERSION
564 u_versionFromString(UCAVersion
, buffer
+vtLen
);
566 fprintf(stdout
, "UCA version [%hu.%hu.%hu.%hu]\n", UCAVersion
[0], UCAVersion
[1], UCAVersion
[2], UCAVersion
[3]);
569 //element->cPoints[0] = (UChar)theValue;
574 fprintf(stderr
, "Warning: unrecognized option: %s\n", buffer
);
575 //*status = U_INVALID_FORMAT_ERROR;
578 element
->variableTop
= FALSE
;
580 startCodePoint
= buffer
;
581 endCodePoint
= strchr(startCodePoint
, ';');
583 if(endCodePoint
== 0) {
584 fprintf(stderr
, "error - line with no code point!\n");
585 *status
= U_INVALID_FORMAT_ERROR
; /* No code point - could be an error, but probably only an empty line */
591 memset(element
, 0, sizeof(*element
));
593 element
->cPoints
= element
->uchars
;
595 spacePointer
= strchr(buffer
, ' ');
596 if(sscanf(buffer
, "%4x", &theValue
) != 1) /* read first code point */
598 fprintf(stderr
, " scanf(hex) failed!\n ");
600 element
->cPoints
[0] = (UChar
)theValue
;
602 if(spacePointer
== 0) {
603 detectedContraction
= FALSE
;
606 dashPointer
= strchr(buffer
, '|');
607 if (dashPointer
!= NULL
) {
609 element
->prefixChars
[0] = (UChar
)theValue
;
610 element
->prefixSize
= 1;
611 element
->prefix
= element
->prefixChars
;
612 sscanf(dashPointer
+1, "%4x", &theValue
);
613 element
->cPoints
[0] = (UChar
)theValue
;
617 // Contractions or surrogate characters.
619 detectedContraction
= TRUE
;
620 while(spacePointer
!= NULL
) {
621 sscanf(spacePointer
+1, "%4x", &theValue
);
622 element
->cPoints
[i
++] = (UChar
)theValue
;
623 spacePointer
= strchr(spacePointer
+1, ' ');
629 //fprintf(stderr, "Number of codepoints in contraction: %i\n", i);
632 startCodePoint
= endCodePoint
+1;
634 commentStart
= strchr(startCodePoint
, '#');
635 if(commentStart
== NULL
) {
636 commentStart
= strlen(startCodePoint
) + startCodePoint
;
640 uint32_t CEindex
= 0;
641 element
->noOfCEs
= 0;
643 endCodePoint
= strchr(startCodePoint
, ']');
644 if(endCodePoint
== NULL
|| endCodePoint
>= commentStart
) {
647 pointer
= strchr(startCodePoint
, '[');
650 element
->sizePrim
[i
]=readElement(&pointer
, primary
, ',', status
);
651 element
->sizeSec
[i
]=readElement(&pointer
, secondary
, ',', status
);
652 element
->sizeTer
[i
]=readElement(&pointer
, tertiary
, ']', status
);
655 /* I want to get the CEs entered right here, including continuation */
656 element
->CEs
[CEindex
++] = getSingleCEValue(primary
, secondary
, tertiary
, status
);
659 while(2*CEi
<element
->sizePrim
[i
] || CEi
<element
->sizeSec
[i
] || CEi
<element
->sizeTer
[i
]) {
660 uint32_t value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
661 if(2*CEi
<element
->sizePrim
[i
]) {
662 value
|= ((hex2num(*(primary
+4*CEi
))&0xF)<<28);
663 value
|= ((hex2num(*(primary
+4*CEi
+1))&0xF)<<24);
666 if(2*CEi
+1<element
->sizePrim
[i
]) {
667 value
|= ((hex2num(*(primary
+4*CEi
+2))&0xF)<<20);
668 value
|= ((hex2num(*(primary
+4*CEi
+3))&0xF)<<16);
671 if(CEi
<element
->sizeSec
[i
]) {
672 value
|= ((hex2num(*(secondary
+2*CEi
))&0xF)<<12);
673 value
|= ((hex2num(*(secondary
+2*CEi
+1))&0xF)<<8);
676 if(CEi
<element
->sizeTer
[i
]) {
677 value
|= ((hex2num(*(tertiary
+2*CEi
))&0x3)<<4);
678 value
|= (hex2num(*(tertiary
+2*CEi
+1))&0xF);
683 element
->CEs
[CEindex
++] = value
;
686 startCodePoint
= endCodePoint
+1;
689 element
->noOfCEs
= CEindex
;
691 element
->isThai
= UCOL_ISTHAIPREVOWEL(element
->cPoints
[0]);
693 // we don't want any strange stuff after useful data!
694 if (pointer
== NULL
) {
695 /* huh? Did we get ']' without the '['? Pair your brackets! */
696 *status
=U_INVALID_FORMAT_ERROR
;
699 while(pointer
< commentStart
) {
700 if(*pointer
!= ' ' && *pointer
!= '\t')
702 *status
=U_INVALID_FORMAT_ERROR
;
709 if(U_FAILURE(*status
)) {
710 fprintf(stderr
, "problem putting stuff in hash table %s\n", u_errorName(*status
));
711 *status
= U_INTERNAL_PROGRAM_ERROR
;
719 void writeOutData(UCATableHeader
*data
,
720 UCAConstants
*consts
,
721 UChar contractions
[][3],
722 uint32_t noOfcontractions
,
723 const char *outputDir
,
724 const char *copyright
,
727 if(U_FAILURE(*status
)) {
731 uint32_t size
= data
->size
;
733 data
->UCAConsts
= data
->size
;
734 data
->size
+= paddedsize(sizeof(UCAConstants
));
736 if(noOfcontractions
!= 0) {
737 contractions
[noOfcontractions
][0] = 0;
738 contractions
[noOfcontractions
][1] = 0;
739 contractions
[noOfcontractions
][2] = 0;
743 data
->contractionUCACombos
= data
->size
;
744 data
->contractionUCACombosWidth
= 3;
745 data
->contractionUCACombosSize
= noOfcontractions
;
746 data
->size
+= paddedsize((noOfcontractions
*3*sizeof(UChar
)));
749 UNewDataMemory
*pData
;
753 uprv_memcpy(&ucaInfo
, &ucaDataInfo
, sizeof(UDataInfo
));
754 u_getUnicodeVersion(ucaInfo
.dataVersion
);
756 pData
=udata_create(outputDir
, UCA_DATA_TYPE
, UCA_DATA_NAME
, &ucaInfo
,
759 if(U_FAILURE(*status
)) {
760 fprintf(stderr
, "Error: unable to create %s"UCA_DATA_NAME
", error %s\n", outputDir
, u_errorName(*status
));
764 /* write the data to the file */
766 fprintf(stdout
, "Writing out UCA table: %s%c%s.%s\n", outputDir
,
768 U_ICUDATA_NAME
"_" UCA_DATA_NAME
,
771 udata_writeBlock(pData
, data
, size
);
773 // output the constants here
774 udata_writeBlock(pData
, consts
, sizeof(UCAConstants
));
776 if(noOfcontractions
!= 0) {
777 udata_writeBlock(pData
, contractions
, noOfcontractions
*3*sizeof(UChar
));
778 udata_writePadding(pData
, paddedsize((noOfcontractions
*3*sizeof(UChar
))) - noOfcontractions
*3*sizeof(uint16_t));
782 dataLength
=udata_finish(pData
, status
);
783 if(U_FAILURE(*status
)) {
784 fprintf(stderr
, "Error: error %d writing the output file\n", *status
);
790 write_uca_table(const char *filename
,
791 const char *outputDir
,
792 const char *copyright
,
795 FILE *data
= fopen(filename
, "r");
797 fprintf(stderr
, "Couldn't open file: %s\n", filename
);
801 UCAElements
*element
= NULL
;
802 UChar variableTopValue
= 0;
803 UCATableHeader
*myD
= (UCATableHeader
*)uprv_malloc(sizeof(UCATableHeader
));
806 *status
= U_MEMORY_ALLOCATION_ERROR
;
810 uprv_memset(myD
, 0, sizeof(UCATableHeader
));
811 UColOptionSet
*opts
= (UColOptionSet
*)uprv_malloc(sizeof(UColOptionSet
));
814 *status
= U_MEMORY_ALLOCATION_ERROR
;
819 uprv_memset(opts
, 0, sizeof(UColOptionSet
));
820 UChar contractionCEs
[512][3];
821 uprv_memset(contractionCEs
, 0, 512*3*sizeof(UChar
));
822 uint32_t noOfContractions
= 0;
824 uprv_memset(&consts
, 0, sizeof(consts
));
826 UCAConstants consts
= {
827 UCOL_RESET_TOP_VALUE
,
828 UCOL_FIRST_PRIMARY_IGNORABLE
,
829 UCOL_LAST_PRIMARY_IGNORABLE
,
830 UCOL_LAST_PRIMARY_IGNORABLE_CONT
,
831 UCOL_FIRST_SECONDARY_IGNORABLE
,
832 UCOL_LAST_SECONDARY_IGNORABLE
,
833 UCOL_FIRST_TERTIARY_IGNORABLE
,
834 UCOL_LAST_TERTIARY_IGNORABLE
,
837 UCOL_FIRST_NON_VARIABLE
,
838 UCOL_LAST_NON_VARIABLE
,
842 UCOL_NEXT_FIRST_PRIMARY_IGNORABLE,
843 UCOL_NEXT_LAST_PRIMARY_IGNORABLE,
844 UCOL_NEXT_FIRST_SECONDARY_IGNORABLE,
845 UCOL_NEXT_LAST_SECONDARY_IGNORABLE,
846 UCOL_NEXT_FIRST_TERTIARY_IGNORABLE,
847 UCOL_NEXT_LAST_TERTIARY_IGNORABLE,
848 UCOL_NEXT_FIRST_VARIABLE,
849 UCOL_NEXT_LAST_VARIABLE,
852 PRIMARY_IMPLICIT_MIN
,
858 uprv_memset(inverseTable
, 0xDA, sizeof(int32_t)*3*0xFFFF);
860 opts
->variableTopValue
= variableTopValue
;
861 opts
->strength
= UCOL_TERTIARY
;
862 opts
->frenchCollation
= UCOL_OFF
;
863 opts
->alternateHandling
= UCOL_NON_IGNORABLE
; /* attribute for handling variable elements*/
864 opts
->caseFirst
= UCOL_OFF
; /* who goes first, lower case or uppercase */
865 opts
->caseLevel
= UCOL_OFF
; /* do we have an extra case level */
866 opts
->normalizationMode
= UCOL_OFF
; /* attribute for normalization */
867 opts
->hiraganaQ
= UCOL_OFF
; /* attribute for JIS X 4061, used only in Japanese */
868 opts
->numericCollation
= UCOL_OFF
;
869 myD
->jamoSpecial
= FALSE
;
871 tempUCATable
*t
= uprv_uca_initTempTable(myD
, opts
, NULL
, IMPLICIT_TAG
, LEAD_SURROGATE_TAG
, status
);
872 if(U_FAILURE(*status
))
874 fprintf(stderr
, "Failed to init UCA temp table: %s\n", u_errorName(*status
));
884 *****************************************************************************************
885 * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF
886 ******************************************************************************************
898 {0xAC00, 0xD7AF, UCOL_SPECIAL_FLAG
| (HANGUL_SYLLABLE_TAG
<< 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
899 {0xD800, 0xDBFF, UCOL_SPECIAL_FLAG
| (LEAD_SURROGATE_TAG
<< 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/
900 {0xDC00, 0xDFFF, UCOL_SPECIAL_FLAG
| (TRAIL_SURROGATE_TAG
<< 24) }, //2 TRAIL_SURROGATE DC00-DFFF
901 {0x3400, 0x4DB5, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
902 {0x4E00, 0x9FA5, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
903 {0xF900, 0xFA2D, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
904 {0x20000, 0x2A6D6, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
905 {0x2F800, 0x2FA1D, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
907 {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG
| (HANGUL_SYLLABLE_TAG
<< 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
908 //{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/
909 {0xDC00, 0xE000, UCOL_SPECIAL_FLAG
| (TRAIL_SURROGATE_TAG
<< 24) }, //2 TRAIL_SURROGATE DC00-DFFF
910 // Now directly handled in the collation code by the swapCJK function.
911 //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
912 //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
913 //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
914 //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
915 //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
919 for(i
= 0; i
<sizeof(ranges
)/sizeof(ranges
[0]); i
++) {
920 /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */
921 utrie_setRange32(t
->mapping
, ranges
[i
].start
, ranges
[i
].end
, ranges
[i
].value
, TRUE
);
925 int32_t surrogateCount
= 0;
927 if(U_FAILURE(*status
)) {
928 fprintf(stderr
, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
929 *status
, u_errorName(*status
), (int)line
, filename
);
933 element
= readAnElement(data
, t
, &consts
, status
);
936 fprintf(stdout
, "%u ", (int)line
);
938 if(element
!= NULL
) {
939 // we have read the line, now do something sensible with the read data!
941 // Below stuff was taken care of in readAnElement
942 //if(element->variableTop == TRUE && variableTopValue == 0) {
943 // t->options->variableTopValue = element->cPoints[0];
946 // if element is a contraction, we want to add it to contractions
947 if(element
->cSize
> 1 && element
->cPoints
[0] != 0xFDD0) { // this is a contraction
948 if(UTF_IS_LEAD(element
->cPoints
[0]) && UTF_IS_TRAIL(element
->cPoints
[1]) && element
->cSize
== 2) {
951 contractionCEs
[noOfContractions
][0] = element
->cPoints
[0];
952 contractionCEs
[noOfContractions
][1] = element
->cPoints
[1];
953 if(element
->cSize
> 2) { // the third one
954 contractionCEs
[noOfContractions
][2] = element
->cPoints
[2];
956 contractionCEs
[noOfContractions
][2] = 0;
962 // TODO (claireho): does this work? Need more tests
963 // The following code is to handle the UCA pre-context rules
964 // for L/l with middle dot. We share the structures for contractionCombos.
965 // The format for pre-context character is
966 // contractionCEs[0]: codepoint in element->cPoints[0]
967 // contractionCEs[1]: '\0' to differentiate with contractions.
968 // contractionCEs[2]: prefix char
969 if (element
->prefixSize
>0) {
970 contractionCEs
[noOfContractions
][0]=element
->cPoints
[0];
971 contractionCEs
[noOfContractions
][1]='\0';
972 contractionCEs
[noOfContractions
][2]=element
->prefixChars
[0];
978 /* we're first adding to inverse, because addAnElement will reverse the order */
979 /* of code points and stuff... we don't want that to happen */
980 addToInverse(element
, status
);
981 if(!(element
->cSize
> 1 && element
->cPoints
[0] == 0xFDD0)) {
982 uprv_uca_addAnElement(t
, element
, status
);
987 if(UCAVersion
[0] == 0 && UCAVersion
[1] == 0 && UCAVersion
[2] == 0 && UCAVersion
[3] == 0) {
988 fprintf(stderr
, "UCA version not specified. Cannot create data file!\n");
989 uprv_uca_closeTempTable(t
);
996 uint32_t trieWord = utrie_get32(t->mapping, 0xDC01, NULL);
1000 fprintf(stdout
, "\nLines read: %u\n", (int)line
);
1001 fprintf(stdout
, "Surrogate count: %i\n", (int)surrogateCount
);
1002 fprintf(stdout
, "Raw data breakdown:\n");
1003 /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
1004 fprintf(stdout
, "Number of contractions: %u\n", (int)noOfContractions
);
1005 fprintf(stdout
, "Contraction image size: %u\n", (int)t
->image
->contractionSize
);
1006 fprintf(stdout
, "Expansions size: %i\n", (int)t
->expansions
->position
);
1010 /* produce canonical closure for table */
1011 /* first set up constants for implicit calculation */
1012 uprv_uca_initImplicitConstants(status
);
1013 /* do the closure */
1014 int32_t noOfClosures
= uprv_uca_canonicalClosure(t
, NULL
, status
);
1015 if(noOfClosures
!= 0) {
1016 fprintf(stderr
, "Warning: %i canonical closures occured!\n", (int)noOfClosures
);
1020 UCATableHeader
*myData
= uprv_uca_assembleTable(t
, status
);
1023 fprintf(stdout
, "Compacted data breakdown:\n");
1024 /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
1025 fprintf(stdout
, "Number of contractions: %u\n", (int)noOfContractions
);
1026 fprintf(stdout
, "Contraction image size: %u\n", (int)t
->image
->contractionSize
);
1027 fprintf(stdout
, "Expansions size: %i\n", (int)t
->expansions
->position
);
1030 if(U_FAILURE(*status
)) {
1031 fprintf(stderr
, "Error creating table: %s\n", u_errorName(*status
));
1032 uprv_uca_closeTempTable(t
);
1039 /* populate the version info struct with version info*/
1040 myData
->version
[0] = UCOL_BUILDER_VERSION
;
1041 myData
->version
[1] = UCAVersion
[0];
1042 myData
->version
[2] = UCAVersion
[1];
1043 myData
->version
[3] = UCAVersion
[2];
1044 /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
1045 // Removed this macro. Instead, we use the fields below
1046 //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION;
1047 //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt
1048 uprv_memcpy(myData
->UCAVersion
, UCAVersion
, sizeof(UVersionInfo
));
1049 u_getUnicodeVersion(myData
->UCDVersion
);
1051 writeOutData(myData
, &consts
, contractionCEs
, noOfContractions
, outputDir
, copyright
, status
);
1053 InverseUCATableHeader
*inverse
= assembleInverseTable(status
);
1054 uprv_memcpy(inverse
->UCAVersion
, UCAVersion
, sizeof(UVersionInfo
));
1055 writeOutInverseData(inverse
, outputDir
, copyright
, status
);
1057 uprv_uca_closeTempTable(t
);
1069 #endif /* #if !UCONFIG_NO_COLLATION */
1071 static UOption options
[]={
1072 UOPTION_HELP_H
, /* 0 Numbers for those who*/
1073 UOPTION_HELP_QUESTION_MARK
, /* 1 can't count. */
1074 UOPTION_COPYRIGHT
, /* 2 */
1075 UOPTION_VERSION
, /* 3 */
1076 UOPTION_DESTDIR
, /* 4 */
1077 UOPTION_SOURCEDIR
, /* 5 */
1078 UOPTION_VERBOSE
, /* 6 */
1079 UOPTION_ICUDATADIR
/* 7 */
1080 /* weiv can't count :))))) */
1083 int main(int argc
, char* argv
[]) {
1084 UErrorCode status
= U_ZERO_ERROR
;
1085 const char* destdir
= NULL
;
1086 const char* srcDir
= NULL
;
1088 char *basename
= NULL
;
1089 const char *copyright
= NULL
;
1090 uprv_memset(&UCAVersion
, 0, 4);
1092 U_MAIN_INIT_ARGS(argc
, argv
);
1094 /* preset then read command line options */
1095 options
[4].value
=u_getDataDirectory();
1096 options
[5].value
="";
1097 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
1099 /* error handling, printing usage message */
1102 "error in command line argument \"%s\"\n",
1107 if(options
[0].doesOccur
|| options
[1].doesOccur
) {
1109 "usage: %s [-options] file\n"
1110 "\tRead in UCA collation text data and write out the binary collation data\n"
1112 "\t-h or -? or --help this usage text\n"
1113 "\t-V or --version show a version message\n"
1114 "\t-c or --copyright include a copyright notice\n"
1115 "\t-d or --destdir destination directory, followed by the path\n"
1116 "\t-s or --sourcedir source directory, followed by the path\n"
1117 "\t-v or --verbose turn on verbose output\n"
1118 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
1119 "\t followed by path, defaults to %s\n",
1120 argv
[0], u_getDataDirectory());
1121 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
1123 if(options
[3].doesOccur
) {
1124 fprintf(stdout
, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n",
1125 #if UCONFIG_NO_COLLATION
1128 UCA_FORMAT_VERSION_0
, UCA_FORMAT_VERSION_1
1131 fprintf(stdout
, U_COPYRIGHT_STRING
"\n");
1135 /* get the options values */
1136 destdir
= options
[4].value
;
1137 srcDir
= options
[5].value
;
1138 VERBOSE
= options
[6].doesOccur
;
1140 if (options
[2].doesOccur
) {
1141 copyright
= U_COPYRIGHT_STRING
;
1144 if (options
[7].doesOccur
) {
1145 u_setDataDirectory(options
[7].value
);
1147 /* Initialize ICU */
1149 if (U_FAILURE(status
) && status
!= U_FILE_ACCESS_ERROR
) {
1150 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
1151 argv
[0], u_errorName(status
));
1154 status
= U_ZERO_ERROR
;
1157 /* prepare the filename beginning with the source dir */
1158 uprv_strcpy(filename
, srcDir
);
1159 basename
=filename
+uprv_strlen(filename
);
1161 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
1162 *basename
++ = U_FILE_SEP_CHAR
;
1166 uprv_strcpy(basename
, "FractionalUCA.txt");
1169 uprv_strcpy(basename
, getLongPathname(*argv
));
1173 if(u_getCombiningClass(0x0053) == 0)
1175 fprintf(stderr
, "SEVERE ERROR: Normalization data is not functioning! Bailing out. Was not able to load unorm.dat.\n");
1180 #if UCONFIG_NO_COLLATION
1182 UNewDataMemory
*pData
;
1185 msg
= "genuca writes dummy " UCA_DATA_NAME
"." UCA_DATA_TYPE
" because of UCONFIG_NO_COLLATION, see uconfig.h";
1186 fprintf(stderr
, "%s\n", msg
);
1187 pData
= udata_create(destdir
, UCA_DATA_TYPE
, UCA_DATA_NAME
, &dummyDataInfo
,
1189 udata_writeBlock(pData
, msg
, strlen(msg
));
1190 udata_finish(pData
, &status
);
1192 msg
= "genuca writes dummy " INVC_DATA_NAME
"." INVC_DATA_TYPE
" because of UCONFIG_NO_COLLATION, see uconfig.h";
1193 fprintf(stderr
, "%s\n", msg
);
1194 pData
= udata_create(destdir
, INVC_DATA_TYPE
, INVC_DATA_NAME
, &dummyDataInfo
,
1196 udata_writeBlock(pData
, msg
, strlen(msg
));
1197 udata_finish(pData
, &status
);
1203 return write_uca_table(filename
, destdir
, copyright
, &status
);
1209 * Hey, Emacs, please set the following:
1212 * indent-tabs-mode: nil