2 *******************************************************************************
4 * Copyright (C) 2000-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: genuca.cpp
10 * tab size: 8 (not used)
13 * created at the end of XX century
14 * created by: Vladimir Weinstein
16 * This program reads the Fractional UCA table and generates
17 * internal format for UCA table as well as inverse UCA table.
18 * It then writes binary files containing the data: ucadata.dat
21 * 02/23/2001 grhoten Made it into a tool
22 * 02/23/2001 weiv Moved element & table handling code to i18n
23 * 05/09/2001 weiv Case bits are now in the CEs, not in front
26 #include "unicode/utypes.h"
27 #include "unicode/putil.h"
28 #include "unicode/udata.h"
29 #include "unicode/uclean.h"
43 UBool VERBOSE
= FALSE
;
45 static UVersionInfo UCAVersion
;
47 #if UCONFIG_NO_COLLATION
49 /* dummy UDataInfo cf. udata.h */
50 static UDataInfo dummyDataInfo
= {
59 { 0, 0, 0, 0 }, /* dummy dataFormat */
60 { 0, 0, 0, 0 }, /* dummy formatVersion */
61 { 0, 0, 0, 0 } /* dummy dataVersion */
66 static const UDataInfo ucaDataInfo
={
75 {UCA_DATA_FORMAT_0
, UCA_DATA_FORMAT_1
, UCA_DATA_FORMAT_2
, UCA_DATA_FORMAT_3
}, /* dataFormat="UCol" */
76 /* 03/26/2002 bumped up version since format has changed */
77 /* 09/16/2002 bumped up version since we went from UColAttributeValue */
78 /* to int32_t in UColOptionSet */
79 /* 05/13/2003 This one also updated since we added UCA and UCD versions */
81 /* 09/11/2003 Adding information required by data swapper */
82 {UCA_FORMAT_VERSION_0
, UCA_FORMAT_VERSION_1
, UCA_FORMAT_VERSION_2
, UCA_FORMAT_VERSION_3
}, /* formatVersion */
83 {0, 0, 0, 0} /* dataVersion = Unicode Version*/
86 static const UDataInfo invUcaDataInfo
={
95 {INVUCA_DATA_FORMAT_0
, INVUCA_DATA_FORMAT_1
, INVUCA_DATA_FORMAT_2
, INVUCA_DATA_FORMAT_3
}, /* dataFormat="InvC" */
96 /* 03/26/2002 bumped up version since format has changed */
97 /* 04/29/2003 2.1 format - we have added UCA version to header */
98 {INVUCA_FORMAT_VERSION_0
, INVUCA_FORMAT_VERSION_1
, INVUCA_FORMAT_VERSION_2
, INVUCA_FORMAT_VERSION_3
}, /* formatVersion */
99 {0, 0, 0, 0} /* dataVersion = Unicode Version*/
/*
 * readElement
 * Scans the text at *from until the `separator` character is found and
 * stores characters into a local `buffer` at index `i`.
 * NOTE(review): this listing omits several original lines (the embedded
 * line numbers skip), including the declarations of `buffer` and `i`, the
 * code that advances *from, the use of `to`, and the return statement.
 * Confirm those against the full file before relying on this description.
 *
 * Parameters:
 *   from      - pointer to the read cursor into the text being parsed
 *   to        - output buffer; presumably receives the extracted text
 *               (the copy is not visible here - TODO confirm)
 *   separator - character that terminates the element
 *   status    - ICU error code, tested with U_FAILURE before any work
 */
104 int32_t readElement(char **from
, char *to
, char separator
, UErrorCode
*status
) {
105 if(U_FAILURE(*status
)) {
/* Walk the input until the separator character is reached. */
110 while(**from
!= separator
) {
/* Store the current input character in buffer and bump the index. */
112 *(buffer
+i
++) = **from
;
118 //*to = (char *)malloc(strlen(buffer)+1);
/*
 * getSingleCEValue
 * Builds one 32-bit collation element from three hexadecimal strings.
 * Each non-empty string is parsed with strtoul(..., 16); an empty string
 * yields 0. The three numbers are then combined with the UCOL_*ORDERSHIFT
 * and UCOL_*ORDERMASK macros.
 * NOTE(review): the bodies of the strlen(...) > 4 / > 2 branches and of the
 * `primvalue <= 0xFF` branch are not present in this listing, nor is the
 * return statement; what they do cannot be stated from here.
 *
 * Parameters:
 *   primary, secondary, tertiary - hex digit strings for the three weights
 *   status                       - ICU error code, tested with U_FAILURE
 */
124 uint32_t getSingleCEValue(char *primary
, char *secondary
, char *tertiary
, UErrorCode
*status
) {
125 if(U_FAILURE(*status
)) {
129 char primsave
= '\0';
/* primend points four characters into the primary string. */
132 char *primend
= primary
+4;
133 if(strlen(primary
) > 4) {
/* secend points two characters into the secondary string. */
137 char *secend
= secondary
+2;
138 if(strlen(secondary
) > 2) {
/* terend points two characters into the tertiary string. */
142 char *terend
= tertiary
+2;
143 if(strlen(tertiary
) > 2) {
/* Parse each weight as base-16; an empty string counts as 0. */
147 uint32_t primvalue
= (uint32_t)((*primary
!='\0')?strtoul(primary
, &primend
, 16):0);
148 uint32_t secvalue
= (uint32_t)((*secondary
!='\0')?strtoul(secondary
, &secend
, 16):0);
149 uint32_t tervalue
= (uint32_t)((*tertiary
!='\0')?strtoul(tertiary
, &terend
, 16):0);
150 if(primvalue
<= 0xFF) {
/* Pack the three weights into a single CE using the shift/mask macros. */
154 value
= ((primvalue
<<UCOL_PRIMARYORDERSHIFT
)&UCOL_PRIMARYORDERMASK
)|
155 ((secvalue
<<UCOL_SECONDARYORDERSHIFT
)&UCOL_SECONDARYORDERMASK
)|
156 (tervalue
&UCOL_TERTIARYORDERMASK
);
/*
 * File-scope state for the inverse UCA table.
 * As filled in by addNewInverse/insertInverse in this file, each row holds:
 *   [0] the first CE of an element,
 *   [1] its second CE when that CE is a continuation, otherwise 0,
 *   [2] either a single code point, or
 *       (size << UCOL_INV_SHIFTVALUE) | offset into stringContinue.
 */
170 static uint32_t inverseTable
[0xFFFF][3];
/* Row index used when appending; compared against as the last row. */
171 static uint32_t inversePos
= 0;
/* Storage for the code point sequences referenced from column [2]. */
172 static UChar stringContinue
[0xFFFF];
/* Next write position in stringContinue. */
173 static uint32_t sContPos
= 0;
/*
 * addNewInverse
 * Fills the inverseTable row at index inversePos from `element`.
 * NOTE(review): the line that advances inversePos (original line 181/182)
 * and the `else` line before original line 187 are not in this listing.
 *
 * Parameters:
 *   element - parsed UCA element supplying CEs, cPoints and cSize
 *   status  - ICU error code, tested with U_FAILURE before any work
 */
175 static void addNewInverse(UCAElements
*element
, UErrorCode
*status
) {
176 if(U_FAILURE(*status
)) {
/* Verbose-only hook; the only visible body is a commented-out fprintf. */
179 if(VERBOSE
&& isContinuation(element
->CEs
[1])) {
180 //fprintf(stdout, "+");
/* Column 0: the first CE. */
183 inverseTable
[inversePos
][0] = element
->CEs
[0];
/* Column 1: the second CE, only when it is a continuation. */
184 if(element
->noOfCEs
> 1 && isContinuation(element
->CEs
[1])) {
185 inverseTable
[inversePos
][1] = element
->CEs
[1];
/* Presumably the else branch: no continuation, so store 0. */
187 inverseTable
[inversePos
][1] = 0;
/* Column 2: a lone code point is stored directly. */
189 if(element
->cSize
< 2) {
190 inverseTable
[inversePos
][2] = element
->cPoints
[0];
191 } else { /* add a new store of cruft */
/* Otherwise store (cSize+1) shifted into the size field plus the offset. */
192 inverseTable
[inversePos
][2] = ((element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | sContPos
;
/* Copy the code points and reserve cSize+1 slots in stringContinue. */
193 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
194 sContPos
+= element
->cSize
+1;
/*
 * insertInverse
 * Writes `element` into the inverseTable row at `position`, first shifting
 * rows position..inversePos up by one when position <= inversePos.
 * NOTE(review): the `else` line before original line 215 and any update of
 * inversePos are not present in this listing.
 *
 * Parameters:
 *   element  - parsed UCA element supplying CEs, cPoints and cSize
 *   position - row index at which the element is written
 *   status   - ICU error code, tested with U_FAILURE before any work
 */
198 static void insertInverse(UCAElements
*element
, uint32_t position
, UErrorCode
*status
) {
199 if(U_FAILURE(*status
)) {
/* Verbose-only hook; the only visible body is a commented-out fprintf. */
203 if(VERBOSE
&& isContinuation(element
->CEs
[1])) {
204 //fprintf(stdout, "+");
206 if(position
<= inversePos
) {
207 /*move stuff around */
/* Byte count for rows position..inversePos inclusive. */
208 uint32_t amountToMove
= (inversePos
- position
+1)*sizeof(inverseTable
[0]);
/* memmove because source and destination rows overlap. */
209 uprv_memmove(inverseTable
[position
+1], inverseTable
[position
], amountToMove
);
/* Column 0: the first CE. */
211 inverseTable
[position
][0] = element
->CEs
[0];
/* Column 1: the second CE, only when it is a continuation. */
212 if(element
->noOfCEs
> 1 && isContinuation(element
->CEs
[1])) {
213 inverseTable
[position
][1] = element
->CEs
[1];
/* Presumably the else branch: no continuation, so store 0. */
215 inverseTable
[position
][1] = 0;
/* Column 2: a lone code point is stored directly. */
217 if(element
->cSize
< 2) {
218 inverseTable
[position
][2] = element
->cPoints
[0];
219 } else { /* add a new store of cruft */
/* Otherwise store (cSize+1) shifted into the size field plus the offset. */
220 inverseTable
[position
][2] = ((element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | sContPos
;
/* Copy the code points and reserve cSize+1 slots in stringContinue. */
221 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
222 sContPos
+= element
->cSize
+1;
/*
 * addToExistingInverse
 * Adds the code points of `element` to the row at `position`, which already
 * holds an entry.
 * Two cases, selected by the size field of column [2]:
 *   - size field is 0: the row held one code point; that code point and the
 *     new ones are written to stringContinue and column [2] is rewritten as
 *     (cSize+3) << UCOL_INV_SHIFTVALUE | offset.
 *   - otherwise: the new code points are written at contIndex+contSize and
 *     the stored size grows by cSize+1.
 * In both cases 0xFFFF is written before the new code points and 0xFFFE
 * after them.
 * NOTE(review): original lines 236, 248-249 and 254 are not in this listing
 * (line 236 presumably advances sContPos past the old code point); confirm
 * against the full file.
 *
 * Parameters:
 *   element  - parsed UCA element supplying cPoints and cSize
 *   position - index of the existing row
 *   status   - ICU error code, tested with U_FAILURE before any work
 */
227 static void addToExistingInverse(UCAElements
*element
, uint32_t position
, UErrorCode
*status
) {
229 if(U_FAILURE(*status
)) {
233 if((inverseTable
[position
][2] & UCOL_INV_SIZEMASK
) == 0) { /* single element, have to make new extension place and put both guys there */
/* Move the previously stored code point into stringContinue. */
234 stringContinue
[sContPos
] = (UChar
)inverseTable
[position
][2];
/* Column [2] becomes size/offset form. */
235 inverseTable
[position
][2] = ((element
->cSize
+3) << UCOL_INV_SHIFTVALUE
) | sContPos
;
/* 0xFFFF is written ahead of the new code points. */
237 stringContinue
[sContPos
++] = 0xFFFF;
238 memcpy(stringContinue
+sContPos
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
239 sContPos
+= element
->cSize
;
/* 0xFFFE is written after the new code points. */
240 stringContinue
[sContPos
++] = 0xFFFE;
241 } else { /* adding to the already existing continuing table */
/* Decode the offset and size fields from column [2]. */
242 uint32_t contIndex
= inverseTable
[position
][2] & UCOL_INV_OFFSETMASK
;
243 uint32_t contSize
= (inverseTable
[position
][2] & UCOL_INV_SIZEMASK
) >> UCOL_INV_SHIFTVALUE
;
/* The existing run is not at the end of stringContinue. */
245 if(contIndex
+contSize
< sContPos
) {
246 /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/
/* Copies cSize+1 UChars from just after the run to cSize+1 slots later. */
247 memcpy(stringContinue
+contIndex
+contSize
+element
->cSize
+1, stringContinue
+contIndex
+contSize
, (element
->cSize
+1)*sizeof(UChar
));
/* Overwrite the last slot of the old run with 0xFFFF. */
250 stringContinue
[contIndex
+contSize
-1] = 0xFFFF;
251 memcpy(stringContinue
+contIndex
+contSize
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
252 sContPos
+= element
->cSize
+1;
/* 0xFFFE is written after the new code points. */
253 stringContinue
[contIndex
+contSize
+element
->cSize
] = 0xFFFE;
/* Store the enlarged size with the unchanged offset. */
255 inverseTable
[position
][2] = ((contSize
+element
->cSize
+1) << UCOL_INV_SHIFTVALUE
) | contIndex
;
260 * Takes two CEs (lead and continuation) and
261 * compares them as CEs should be compared:
262 * primary vs. primary, secondary vs. secondary
263 * tertiary vs. tertiary
/*
 * compareCEs
 * Compares two (lead CE, continuation CE) pairs. source[0]/target[0] are the
 * lead CEs; source[1]/target[1] are examined with isContinuation().
 * The visible code builds three pairs of comparison keys from the lead and
 * continuation words: bits 31..16, then bits 15..8, then bits 7..0.
 * NOTE(review): how s2/t2 are assigned, the comparisons between s and t,
 * and the return values fall in gaps of this listing. Callers in this file
 * test the result with "> 0" and "== 0".
 */
265 static int32_t compareCEs(uint32_t *source
, uint32_t *target
) {
266 uint32_t s1
= source
[0], s2
, t1
= target
[0], t2
;
267 if(isContinuation(source
[1])) {
272 if(isContinuation(target
[1])) {
278 uint32_t s
= 0, t
= 0;
/* Both words identical on both sides. */
279 if(s1
== t1
&& s2
== t2
) {
/* Key from the top 16 bits of the lead and continuation words. */
282 s
= (s1
& 0xFFFF0000)|((s2
& 0xFFFF0000)>>16);
283 t
= (t1
& 0xFFFF0000)|((t2
& 0xFFFF0000)>>16);
/* Key from bits 15..8 of the lead and continuation words. */
289 s
= (s1
& 0x0000FF00) | (s2
& 0x0000FF00)>>8;
290 t
= (t1
& 0x0000FF00) | (t2
& 0x0000FF00)>>8;
/* Key from bits 7..0 of the lead and continuation words. */
296 s
= (s1
& 0x000000FF)<<8 | (s2
& 0x000000FF);
297 t
= (t1
& 0x000000FF)<<8 | (t2
& 0x000000FF);
/*
 * addToInverse
 * Places `element` into inverseTable, dispatching to one of the helpers:
 *   - table empty (inversePos == 0): zero row 0, then addNewInverse;
 *   - last row compares greater than the element: scan backwards for the
 *     first row that does not compare greater, then addToExistingInverse
 *     on an equal row or insertInverse just after it;
 *   - last row compares equal: addToExistingInverse on the last row;
 *   - the remaining call (line 329) is addNewInverse.
 * element->CEs[0] is masked with 0xFFFFFF3F for the duration of the call and
 * restored from saveElement afterwards (presumably hiding the case bits -
 * TODO confirm).
 * NOTE(review): the body of the `noOfCEs == 1` branch, several else lines
 * and the return statement are not in this listing.
 *
 * Parameters:
 *   element - parsed UCA element to add
 *   status  - ICU error code, passed through to the helpers
 */
307 static uint32_t addToInverse(UCAElements
*element
, UErrorCode
*status
) {
308 uint32_t position
= inversePos
;
/* Keep the unmasked first CE so it can be restored before returning. */
309 uint32_t saveElement
= element
->CEs
[0];
310 int32_t compResult
= 0;
/* Clear bits 7..6 of the first CE before comparing. */
311 element
->CEs
[0] &= 0xFFFFFF3F;
312 if(element
->noOfCEs
== 1) {
/* Empty table: clear row 0 and append. */
315 if(inversePos
== 0) {
316 inverseTable
[0][0] = inverseTable
[0][1] = inverseTable
[0][2] = 0;
317 addNewInverse(element
, status
);
/* Element sorts before the last row: search backwards for its place. */
318 } else if(compareCEs(inverseTable
[inversePos
], element
->CEs
) > 0) {
/* Empty-bodied loop: pre-decrements position until a row is not greater. */
319 while((compResult
= compareCEs(inverseTable
[--position
], element
->CEs
)) > 0);
320 if(VERBOSE
) { fprintf(stdout
, "p:%u ", (int)position
); }
/* Equal row found: merge into it. */
321 if(compResult
== 0) {
322 addToExistingInverse(element
, position
, status
);
/* Otherwise insert immediately after the smaller row. */
324 insertInverse(element
, position
+1, status
);
/* Element equals the last row: merge into it. */
326 } else if(compareCEs(inverseTable
[inversePos
], element
->CEs
) == 0) {
327 addToExistingInverse(element
, inversePos
, status
);
329 addNewInverse(element
, status
);
/* Restore the caller's first CE. */
331 element
->CEs
[0] = saveElement
;
332 if(VERBOSE
) { fprintf(stdout
, "+"); }
/*
 * assembleInverseTable
 * Allocates one buffer holding, in order, a padded InverseUCATableHeader,
 * the inverse table rows and the stringContinue data, fills in the header
 * fields and copies the two arrays in.
 * Before copying it writes a sentinel row at index inversePos and scans the
 * rows for ordering problems, which are only reported on stderr.
 * NOTE(review): the null check on the allocation, the declaration of `i`,
 * the return statement and the condition guarding the
 * U_MEMORY_ALLOCATION_ERROR assignment are not in this listing.
 *
 * Parameters:
 *   status - ICU error code; set to U_MEMORY_ALLOCATION_ERROR at line 373
 */
336 static InverseUCATableHeader
*assembleInverseTable(UErrorCode
*status
)
338 InverseUCATableHeader
*result
= NULL
;
/* Sizes of the three regions of the output buffer. */
339 uint32_t headerByteSize
= paddedsize(sizeof(InverseUCATableHeader
));
/* inversePos+2 rows of three uint32_t each. */
340 uint32_t inverseTableByteSize
= (inversePos
+2)*sizeof(uint32_t)*3;
341 uint32_t contsByteSize
= sContPos
* sizeof(UChar
);
/* One allocation for header + table + continuation strings, zero-filled. */
344 result
= (InverseUCATableHeader
*)uprv_malloc(headerByteSize
+ inverseTableByteSize
+ contsByteSize
);
345 uprv_memset(result
, 0, headerByteSize
+ inverseTableByteSize
+ contsByteSize
);
347 result
->byteSize
= headerByteSize
+ inverseTableByteSize
+ contsByteSize
;
/* Sentinel row written at index inversePos. */
350 inverseTable
[inversePos
][0] = 0xFFFFFFFF;
351 inverseTable
[inversePos
][1] = 0xFFFFFFFF;
352 inverseTable
[inversePos
][2] = 0x0000FFFF;
/* Sanity scan starting at row 2: each row is checked against the previous. */
355 for(i
= 2; i
<inversePos
; i
++) {
/* Previous row compares greater than this one. */
356 if(compareCEs(inverseTable
[i
-1], inverseTable
[i
]) > 0) {
357 fprintf(stderr
, "Error at %i: %08X & %08X\n", (int)i
, (int)inverseTable
[i
-1][0], (int)inverseTable
[i
][0]);
/* Same first CE but column [1] is not strictly increasing. */
358 } else if(inverseTable
[i
-1][0] == inverseTable
[i
][0] && !(inverseTable
[i
-1][1] < inverseTable
[i
][1])) {
359 fprintf(stderr
, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i
, (int)inverseTable
[i
-1][0], (int)inverseTable
[i
-1][1], (int)inverseTable
[i
][0], (int)inverseTable
[i
][1]);
/* Element counts recorded in the header. */
363 result
->tableSize
= inversePos
;
364 result
->contsSize
= sContPos
;
/* Byte offsets of the two arrays from the start of the buffer. */
366 result
->table
= headerByteSize
;
367 result
->conts
= headerByteSize
+ inverseTableByteSize
;
/* Copy the arrays to those offsets. */
369 memcpy((uint8_t *)result
+ result
->table
, inverseTable
, inverseTableByteSize
);
370 memcpy((uint8_t *)result
+ result
->conts
, stringContinue
, contsByteSize
);
373 *status
= U_MEMORY_ALLOCATION_ERROR
;
/*
 * writeOutInverseData
 * Writes the assembled inverse UCA table to a data file via the udata_*
 * writer functions: create, write one block of data->byteSize bytes,
 * finish. Failures are reported on stderr.
 * NOTE(review): the end of the parameter list (a UErrorCode *status is
 * used in the body), the remaining arguments of udata_create and of the
 * progress fprintf, and the declaration of dataLength are not in this
 * listing.
 *
 * Parameters:
 *   data      - table produced by assembleInverseTable
 *   outputDir - directory passed to udata_create
 *   copyright - copyright text (its use is not visible here)
 */
381 static void writeOutInverseData(InverseUCATableHeader
*data
,
382 const char *outputDir
,
383 const char *copyright
,
386 UNewDataMemory
*pData
;
/* Work on a copy of the constant header info so dataVersion can be set. */
390 UDataInfo invUcaInfo
;
391 uprv_memcpy(&invUcaInfo
, &invUcaDataInfo
, sizeof(UDataInfo
));
392 u_getUnicodeVersion(invUcaInfo
.dataVersion
);
394 pData
=udata_create(outputDir
, INVC_DATA_TYPE
, INVC_DATA_NAME
, &invUcaInfo
,
397 if(U_FAILURE(*status
)) {
398 fprintf(stderr
, "Error: unable to create %s"INVC_DATA_NAME
", error %s\n", outputDir
, u_errorName(*status
));
402 /* write the data to the file */
404 fprintf(stdout
, "Writing out inverse UCA table: %s%c%s.%s\n", outputDir
, U_FILE_SEP_CHAR
,
/* The whole assembled buffer goes out as a single block. */
408 udata_writeBlock(pData
, data
, data
->byteSize
);
411 dataLength
=udata_finish(pData
, status
);
412 if(U_FAILURE(*status
)) {
413 fprintf(stderr
, "Error: error %d writing the output file\n", *status
);
/*
 * hex2num
 * Classifies `hex` into one of three ranges: '0'-'9', 'a'-'f' or 'A'-'F'.
 * NOTE(review): the return statement of each branch, and the handling of
 * characters outside these ranges, are not in this listing. Callers in this
 * file mask the result with 0xF or 0x3 and shift it into a CE.
 */
420 static int32_t hex2num(char hex
) {
/* Decimal digit. */
421 if(hex
>='0' && hex
<='9') {
/* Lower-case hex letter. */
423 } else if(hex
>='a' && hex
<='f') {
/* Upper-case hex letter. */
425 } else if(hex
>='A' && hex
<='F') {
432 UCAElements
*readAnElement(FILE *data
, tempUCATable
*t
, UCAConstants
*consts
, UErrorCode
*status
) {
433 char buffer
[2048], primary
[100], secondary
[100], tertiary
[100];
434 UBool detectedContraction
;
436 unsigned int theValue
;
437 char *pointer
= NULL
;
438 char *commentStart
= NULL
;
439 char *startCodePoint
= NULL
;
440 char *endCodePoint
= NULL
;
441 char *spacePointer
= NULL
;
442 char *result
= fgets(buffer
, 2048, data
);
443 int32_t buflen
= (int32_t)uprv_strlen(buffer
);
444 if(U_FAILURE(*status
)) {
447 *primary
= *secondary
= *tertiary
= '\0';
452 fprintf(stderr
, "empty line but no EOF!\n");
453 *status
= U_INVALID_FORMAT_ERROR
;
457 while(buflen
>0 && (buffer
[buflen
-1] == '\r' || buffer
[buflen
-1] == '\n')) {
458 buffer
[--buflen
] = 0;
461 if(buffer
[0] == 0 || buffer
[0] == '#') {
462 return NULL
; // just a comment, skip whole line
465 UCAElements
*element
= &le
; //(UCAElements *)malloc(sizeof(UCAElements));
474 if(buffer
[0] == '[') {
479 ActionType what_to_do
;
480 } vt
[] = { {"[first tertiary ignorable", consts
->UCA_FIRST_TERTIARY_IGNORABLE
, READCE
},
481 {"[last tertiary ignorable", consts
->UCA_LAST_TERTIARY_IGNORABLE
, READCE
},
482 {"[first secondary ignorable", consts
->UCA_FIRST_SECONDARY_IGNORABLE
, READCE
},
483 {"[last secondary ignorable", consts
->UCA_LAST_SECONDARY_IGNORABLE
, READCE
},
484 {"[first primary ignorable", consts
->UCA_FIRST_PRIMARY_IGNORABLE
, READCE
},
485 {"[last primary ignorable", consts
->UCA_LAST_PRIMARY_IGNORABLE
, READCE
},
486 {"[first variable", consts
->UCA_FIRST_VARIABLE
, READCE
},
487 {"[last variable", consts
->UCA_LAST_VARIABLE
, READCE
},
488 {"[first regular", consts
->UCA_FIRST_NON_VARIABLE
, READCE
},
489 {"[last regular", consts
->UCA_LAST_NON_VARIABLE
, READCE
},
490 {"[first implicit", consts
->UCA_FIRST_IMPLICIT
, READCE
},
491 {"[last implicit", consts
->UCA_LAST_IMPLICIT
, READCE
},
492 {"[first trailing", consts
->UCA_FIRST_TRAILING
, READCE
},
493 {"[last trailing", consts
->UCA_LAST_TRAILING
, READCE
},
495 {"[fixed top", &consts
->UCA_PRIMARY_TOP_MIN
, READHEX
},
496 {"[fixed first implicit byte", &consts
->UCA_PRIMARY_IMPLICIT_MIN
, READHEX
},
497 {"[fixed last implicit byte", &consts
->UCA_PRIMARY_IMPLICIT_MAX
, READHEX
},
498 {"[fixed first trail byte", &consts
->UCA_PRIMARY_TRAILING_MIN
, READHEX
},
499 {"[fixed last trail byte", &consts
->UCA_PRIMARY_TRAILING_MAX
, READHEX
},
500 {"[fixed first special byte", &consts
->UCA_PRIMARY_SPECIAL_MIN
, READHEX
},
501 {"[fixed last special byte", &consts
->UCA_PRIMARY_SPECIAL_MAX
, READHEX
},
502 {"[variable top = ", &t
->options
->variableTopValue
, READHEX
},
503 {"[UCA version = ", NULL
, READUCAVERSION
}
505 for (cnt
= 0; cnt
<sizeof(vt
)/sizeof(vt
[0]); cnt
++) {
506 uint32_t vtLen
= (uint32_t)uprv_strlen(vt
[cnt
].name
);
507 if(uprv_strncmp(buffer
, vt
[cnt
].name
, vtLen
) == 0) {
508 element
->variableTop
= TRUE
;
509 if(vt
[cnt
].what_to_do
== READHEX
) {
510 if(sscanf(buffer
+vtLen
, "%4x", &theValue
) != 1) /* read first code point */
512 fprintf(stderr
, " scanf(hex) failed on !\n ");
514 *(vt
[cnt
].what
) = (UChar
)theValue
;
515 //if(cnt == 1) { // first implicit
516 // we need to set the value for top next
517 //uint32_t nextTop = ucol_prv_calculateImplicitPrimary(0x4E00); // CJK base
518 //consts->UCA_NEXT_TOP_VALUE = theValue<<24 | 0x030303;
520 } else if (vt
[cnt
].what_to_do
== READCE
) { /* vt[cnt].what_to_do == READCE */
521 pointer
= strchr(buffer
+vtLen
, '[');
524 element
->sizePrim
[0]=readElement(&pointer
, primary
, ',', status
);
525 element
->sizeSec
[0]=readElement(&pointer
, secondary
, ',', status
);
526 element
->sizeTer
[0]=readElement(&pointer
, tertiary
, ']', status
);
528 vt
[cnt
].what
[0] = getSingleCEValue(primary
, secondary
, tertiary
, status
);
529 if(element
->sizePrim
[0] > 2 || element
->sizeSec
[0] > 1 || element
->sizeTer
[0] > 1) {
531 uint32_t value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
532 if(2*CEi
<element
->sizePrim
[i
]) {
533 value
|= ((hex2num(*(primary
+4*CEi
))&0xF)<<28);
534 value
|= ((hex2num(*(primary
+4*CEi
+1))&0xF)<<24);
537 if(2*CEi
+1<element
->sizePrim
[i
]) {
538 value
|= ((hex2num(*(primary
+4*CEi
+2))&0xF)<<20);
539 value
|= ((hex2num(*(primary
+4*CEi
+3))&0xF)<<16);
542 if(CEi
<element
->sizeSec
[i
]) {
543 value
|= ((hex2num(*(secondary
+2*CEi
))&0xF)<<12);
544 value
|= ((hex2num(*(secondary
+2*CEi
+1))&0xF)<<8);
547 if(CEi
<element
->sizeTer
[i
]) {
548 value
|= ((hex2num(*(tertiary
+2*CEi
))&0x3)<<4);
549 value
|= (hex2num(*(tertiary
+2*CEi
+1))&0xF);
554 vt
[cnt
].what
[1] = value
;
555 //element->CEs[CEindex++] = value;
560 fprintf(stderr
, "Failed to read a CE from line %s\n", buffer
);
562 } else { //vt[cnt].what_to_do == READUCAVERSION
563 u_versionFromString(UCAVersion
, buffer
+vtLen
);
565 fprintf(stdout
, "UCA version [%hu.%hu.%hu.%hu]\n", UCAVersion
[0], UCAVersion
[1], UCAVersion
[2], UCAVersion
[3]);
568 //element->cPoints[0] = (UChar)theValue;
573 fprintf(stderr
, "Warning: unrecognized option: %s\n", buffer
);
574 //*status = U_INVALID_FORMAT_ERROR;
577 element
->variableTop
= FALSE
;
579 startCodePoint
= buffer
;
580 endCodePoint
= strchr(startCodePoint
, ';');
582 if(endCodePoint
== 0) {
583 fprintf(stderr
, "error - line with no code point!\n");
584 *status
= U_INVALID_FORMAT_ERROR
; /* No code point - could be an error, but probably only an empty line */
590 if(element
!= NULL
) {
591 memset(element
, 0, sizeof(*element
));
593 *status
= U_MEMORY_ALLOCATION_ERROR
;
597 element
->cPoints
= element
->uchars
;
599 spacePointer
= strchr(buffer
, ' ');
600 if(sscanf(buffer
, "%4x", &theValue
) != 1) /* read first code point */
602 fprintf(stderr
, " scanf(hex) failed!\n ");
604 element
->cPoints
[0] = (UChar
)theValue
;
606 if(spacePointer
== 0) {
607 detectedContraction
= FALSE
;
611 detectedContraction
= TRUE
;
612 while(spacePointer
!= NULL
) {
613 sscanf(spacePointer
+1, "%4x", &theValue
);
614 element
->cPoints
[i
++] = (UChar
)theValue
;
615 spacePointer
= strchr(spacePointer
+1, ' ');
620 //fprintf(stderr, "Number of codepoints in contraction: %i\n", i);
623 startCodePoint
= endCodePoint
+1;
625 commentStart
= strchr(startCodePoint
, '#');
626 if(commentStart
== NULL
) {
627 commentStart
= strlen(startCodePoint
) + startCodePoint
;
631 uint32_t CEindex
= 0;
632 element
->noOfCEs
= 0;
634 endCodePoint
= strchr(startCodePoint
, ']');
635 if(endCodePoint
== NULL
|| endCodePoint
>= commentStart
) {
638 pointer
= strchr(startCodePoint
, '[');
641 element
->sizePrim
[i
]=readElement(&pointer
, primary
, ',', status
);
642 element
->sizeSec
[i
]=readElement(&pointer
, secondary
, ',', status
);
643 element
->sizeTer
[i
]=readElement(&pointer
, tertiary
, ']', status
);
646 /* I want to get the CEs entered right here, including continuation */
647 element
->CEs
[CEindex
++] = getSingleCEValue(primary
, secondary
, tertiary
, status
);
650 while(2*CEi
<element
->sizePrim
[i
] || CEi
<element
->sizeSec
[i
] || CEi
<element
->sizeTer
[i
]) {
651 uint32_t value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
652 if(2*CEi
<element
->sizePrim
[i
]) {
653 value
|= ((hex2num(*(primary
+4*CEi
))&0xF)<<28);
654 value
|= ((hex2num(*(primary
+4*CEi
+1))&0xF)<<24);
657 if(2*CEi
+1<element
->sizePrim
[i
]) {
658 value
|= ((hex2num(*(primary
+4*CEi
+2))&0xF)<<20);
659 value
|= ((hex2num(*(primary
+4*CEi
+3))&0xF)<<16);
662 if(CEi
<element
->sizeSec
[i
]) {
663 value
|= ((hex2num(*(secondary
+2*CEi
))&0xF)<<12);
664 value
|= ((hex2num(*(secondary
+2*CEi
+1))&0xF)<<8);
667 if(CEi
<element
->sizeTer
[i
]) {
668 value
|= ((hex2num(*(tertiary
+2*CEi
))&0x3)<<4);
669 value
|= (hex2num(*(tertiary
+2*CEi
+1))&0xF);
674 element
->CEs
[CEindex
++] = value
;
677 startCodePoint
= endCodePoint
+1;
680 element
->noOfCEs
= CEindex
;
682 element
->isThai
= UCOL_ISTHAIPREVOWEL(element
->cPoints
[0]);
684 // we don't want any strange stuff after useful data!
685 while(pointer
< commentStart
) {
686 if(*pointer
!= ' ' && *pointer
!= '\t')
688 *status
=U_INVALID_FORMAT_ERROR
;
694 if(U_FAILURE(*status
)) {
695 fprintf(stderr
, "problem putting stuff in hash table %s\n", u_errorName(*status
));
696 *status
= U_INTERNAL_PROGRAM_ERROR
;
/*
 * writeOutData
 * Writes the UCA table to a data file via the udata_* writer functions.
 * The output is written in this order: the table itself (`size` bytes, the
 * value of data->size on entry), the UCAConstants struct, and - when there
 * are contractions - the contraction array followed by padding.
 * Before writing, the header is updated in place: UCAConsts and
 * contractionUCACombos record the offsets at which those sections start,
 * and data->size is grown by their padded sizes.
 * NOTE(review): the end of the parameter list (a UErrorCode *status is used
 * in the body), the declarations of ucaInfo and dataLength, any padding
 * written after the constants, and original line 725 are not in this
 * listing.
 *
 * Parameters:
 *   data             - assembled UCA table header; modified as described
 *   consts           - constants block written after the table
 *   contractions     - rows of three UChars; row [noOfcontractions] is
 *                      zeroed here, so the caller's array must have room
 *   noOfcontractions - number of contraction rows
 *   outputDir        - directory passed to udata_create
 *   copyright        - copyright text (its use is not visible here)
 */
704 void writeOutData(UCATableHeader
*data
,
705 UCAConstants
*consts
,
706 UChar contractions
[][3],
707 uint32_t noOfcontractions
,
708 const char *outputDir
,
709 const char *copyright
,
712 if(U_FAILURE(*status
)) {
/* Remember the table size before extra sections are appended. */
716 uint32_t size
= data
->size
;
/* The constants section starts at the current end of the table. */
718 data
->UCAConsts
= data
->size
;
719 data
->size
+= paddedsize(sizeof(UCAConstants
));
721 if(noOfcontractions
!= 0) {
/* Zero the row at index noOfcontractions. */
722 contractions
[noOfcontractions
][0] = 0;
723 contractions
[noOfcontractions
][1] = 0;
724 contractions
[noOfcontractions
][2] = 0;
/* The contraction section starts at the current end; rows are 3 wide. */
728 data
->contractionUCACombos
= data
->size
;
729 data
->contractionUCACombosWidth
= 3;
730 data
->contractionUCACombosSize
= noOfcontractions
;
731 data
->size
+= paddedsize((noOfcontractions
*3*sizeof(UChar
)));
734 UNewDataMemory
*pData
;
/* Work on a copy of the constant header info so dataVersion can be set. */
738 uprv_memcpy(&ucaInfo
, &ucaDataInfo
, sizeof(UDataInfo
));
739 u_getUnicodeVersion(ucaInfo
.dataVersion
);
741 pData
=udata_create(outputDir
, UCA_DATA_TYPE
, UCA_DATA_NAME
, &ucaInfo
,
744 if(U_FAILURE(*status
)) {
745 fprintf(stderr
, "Error: unable to create %s"UCA_DATA_NAME
", error %s\n", outputDir
, u_errorName(*status
));
749 /* write the data to the file */
751 fprintf(stdout
, "Writing out UCA table: %s%c%s.%s\n", outputDir
,
753 U_ICUDATA_NAME
"_" UCA_DATA_NAME
,
/* Table body: the size saved on entry, not the enlarged data->size. */
756 udata_writeBlock(pData
, data
, size
);
758 // output the constants here
759 udata_writeBlock(pData
, consts
, sizeof(UCAConstants
));
761 if(noOfcontractions
!= 0) {
/* Contraction rows, then padding up to the padded section size. */
762 udata_writeBlock(pData
, contractions
, noOfcontractions
*3*sizeof(UChar
));
763 udata_writePadding(pData
, paddedsize((noOfcontractions
*3*sizeof(UChar
))) - noOfcontractions
*3*sizeof(uint16_t));
767 dataLength
=udata_finish(pData
, status
);
768 if(U_FAILURE(*status
)) {
769 fprintf(stderr
, "Error: error %d writing the output file\n", *status
);
775 write_uca_table(const char *filename
,
776 const char *outputDir
,
777 const char *copyright
,
780 FILE *data
= fopen(filename
, "r");
782 UCAElements
*element
= NULL
;
783 UChar variableTopValue
= 0;
784 UCATableHeader
*myD
= (UCATableHeader
*)uprv_malloc(sizeof(UCATableHeader
));
787 *status
= U_MEMORY_ALLOCATION_ERROR
;
791 uprv_memset(myD
, 0, sizeof(UCATableHeader
));
792 UColOptionSet
*opts
= (UColOptionSet
*)uprv_malloc(sizeof(UColOptionSet
));
795 *status
= U_MEMORY_ALLOCATION_ERROR
;
800 uprv_memset(opts
, 0, sizeof(UColOptionSet
));
801 UChar contractionCEs
[256][3];
802 uprv_memset(contractionCEs
, 0, 256*3*sizeof(UChar
));
803 uint32_t noOfContractions
= 0;
805 uprv_memset(&consts
, 0, sizeof(consts
));
807 UCAConstants consts
= {
808 UCOL_RESET_TOP_VALUE
,
809 UCOL_FIRST_PRIMARY_IGNORABLE
,
810 UCOL_LAST_PRIMARY_IGNORABLE
,
811 UCOL_LAST_PRIMARY_IGNORABLE_CONT
,
812 UCOL_FIRST_SECONDARY_IGNORABLE
,
813 UCOL_LAST_SECONDARY_IGNORABLE
,
814 UCOL_FIRST_TERTIARY_IGNORABLE
,
815 UCOL_LAST_TERTIARY_IGNORABLE
,
818 UCOL_FIRST_NON_VARIABLE
,
819 UCOL_LAST_NON_VARIABLE
,
823 UCOL_NEXT_FIRST_PRIMARY_IGNORABLE,
824 UCOL_NEXT_LAST_PRIMARY_IGNORABLE,
825 UCOL_NEXT_FIRST_SECONDARY_IGNORABLE,
826 UCOL_NEXT_LAST_SECONDARY_IGNORABLE,
827 UCOL_NEXT_FIRST_TERTIARY_IGNORABLE,
828 UCOL_NEXT_LAST_TERTIARY_IGNORABLE,
829 UCOL_NEXT_FIRST_VARIABLE,
830 UCOL_NEXT_LAST_VARIABLE,
833 PRIMARY_IMPLICIT_MIN
,
840 fprintf(stderr
, "Couldn't open file: %s\n", filename
);
844 uprv_memset(inverseTable
, 0xDA, sizeof(int32_t)*3*0xFFFF);
846 opts
->variableTopValue
= variableTopValue
;
847 opts
->strength
= UCOL_TERTIARY
;
848 opts
->frenchCollation
= UCOL_OFF
;
849 opts
->alternateHandling
= UCOL_NON_IGNORABLE
; /* attribute for handling variable elements*/
850 opts
->caseFirst
= UCOL_OFF
; /* who goes first, lower case or uppercase */
851 opts
->caseLevel
= UCOL_OFF
; /* do we have an extra case level */
852 opts
->normalizationMode
= UCOL_OFF
; /* attribute for normalization */
853 opts
->hiraganaQ
= UCOL_OFF
; /* attribute for JIS X 4061, used only in Japanese */
854 opts
->numericCollation
= UCOL_OFF
;
855 myD
->jamoSpecial
= FALSE
;
857 tempUCATable
*t
= uprv_uca_initTempTable(myD
, opts
, NULL
, IMPLICIT_TAG
, LEAD_SURROGATE_TAG
, status
);
858 if(U_FAILURE(*status
))
860 fprintf(stderr
, "Failed to init UCA temp table: %s\n", u_errorName(*status
));
867 *****************************************************************************************
868 * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF
869 ******************************************************************************************
881 {0xAC00, 0xD7AF, UCOL_SPECIAL_FLAG
| (HANGUL_SYLLABLE_TAG
<< 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
882 {0xD800, 0xDBFF, UCOL_SPECIAL_FLAG
| (LEAD_SURROGATE_TAG
<< 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/
883 {0xDC00, 0xDFFF, UCOL_SPECIAL_FLAG
| (TRAIL_SURROGATE_TAG
<< 24) }, //2 TRAIL_SURROGATE DC00-DFFF
884 {0x3400, 0x4DB5, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
885 {0x4E00, 0x9FA5, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
886 {0xF900, 0xFA2D, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
887 {0x20000, 0x2A6D6, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
888 {0x2F800, 0x2FA1D, UCOL_SPECIAL_FLAG
| (CJK_IMPLICIT_TAG
<< 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
890 {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG
| (HANGUL_SYLLABLE_TAG
<< 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
891 //{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/
892 {0xDC00, 0xE000, UCOL_SPECIAL_FLAG
| (TRAIL_SURROGATE_TAG
<< 24) }, //2 TRAIL_SURROGATE DC00-DFFF
893 // Now directly handled in the collation code by the swapCJK function.
894 //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
895 //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
896 //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
897 //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
898 //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
902 for(i
= 0; i
<sizeof(ranges
)/sizeof(ranges
[0]); i
++) {
903 /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */
904 utrie_setRange32(t
->mapping
, ranges
[i
].start
, ranges
[i
].end
, ranges
[i
].value
, TRUE
);
908 int32_t surrogateCount
= 0;
910 if(U_FAILURE(*status
)) {
911 fprintf(stderr
, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
912 *status
, u_errorName(*status
), (int)line
, filename
);
916 element
= readAnElement(data
, t
, &consts
, status
);
919 fprintf(stdout
, "%u ", (int)line
);
921 if(element
!= NULL
) {
922 // we have read the line, now do something sensible with the read data!
924 // Below stuff was taken care of in readAnElement
925 //if(element->variableTop == TRUE && variableTopValue == 0) {
926 // t->options->variableTopValue = element->cPoints[0];
929 // if element is a contraction, we want to add it to contractions
930 if(element
->cSize
> 1 && element
->cPoints
[0] != 0xFDD0) { // this is a contraction
931 if(UTF_IS_LEAD(element
->cPoints
[0]) && UTF_IS_TRAIL(element
->cPoints
[1]) && element
->cSize
== 2) {
934 contractionCEs
[noOfContractions
][0] = element
->cPoints
[0];
935 contractionCEs
[noOfContractions
][1] = element
->cPoints
[1];
936 if(element
->cSize
> 2) { // the third one
937 contractionCEs
[noOfContractions
][2] = element
->cPoints
[2];
939 contractionCEs
[noOfContractions
][2] = 0;
945 /* we're first adding to inverse, because addAnElement will reverse the order */
946 /* of code points and stuff... we don't want that to happen */
947 addToInverse(element
, status
);
948 if(!(element
->cSize
> 1 && element
->cPoints
[0] == 0xFDD0)) {
949 uprv_uca_addAnElement(t
, element
, status
);
954 if(UCAVersion
[0] == 0 && UCAVersion
[1] == 0 && UCAVersion
[2] == 0 && UCAVersion
[3] == 0) {
955 fprintf(stderr
, "UCA version not specified. Cannot create data file!\n");
959 uint32_t trieWord = utrie_get32(t->mapping, 0xDC01, NULL);
963 fprintf(stdout
, "\nLines read: %u\n", (int)line
);
964 fprintf(stdout
, "Surrogate count: %i\n", (int)surrogateCount
);
965 fprintf(stdout
, "Raw data breakdown:\n");
966 /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
967 fprintf(stdout
, "Number of contractions: %u\n", (int)noOfContractions
);
968 fprintf(stdout
, "Contraction image size: %u\n", (int)t
->image
->contractionSize
);
969 fprintf(stdout
, "Expansions size: %i\n", (int)t
->expansions
->position
);
973 /* produce canonical closure for table */
974 /* first set up constants for implicit calculation */
975 uprv_uca_initImplicitConstants(consts
.UCA_PRIMARY_IMPLICIT_MIN
, consts
.UCA_PRIMARY_IMPLICIT_MAX
, status
);
977 int32_t noOfClosures
= uprv_uca_canonicalClosure(t
, status
);
978 if(noOfClosures
!= 0) {
979 fprintf(stderr
, "Warning: %i canonical closures occured!\n", (int)noOfClosures
);
983 UCATableHeader
*myData
= uprv_uca_assembleTable(t
, status
);
986 fprintf(stdout
, "Compacted data breakdown:\n");
987 /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
988 fprintf(stdout
, "Number of contractions: %u\n", (int)noOfContractions
);
989 fprintf(stdout
, "Contraction image size: %u\n", (int)t
->image
->contractionSize
);
990 fprintf(stdout
, "Expansions size: %i\n", (int)t
->expansions
->position
);
993 if(U_FAILURE(*status
)) {
994 fprintf(stderr
, "Error creating table: %s\n", u_errorName(*status
));
998 /* populate the version info struct with version info*/
999 myData
->version
[0] = UCOL_BUILDER_VERSION
;
1000 myData
->version
[1] = UCAVersion
[0];
1001 myData
->version
[2] = UCAVersion
[1];
1002 myData
->version
[3] = UCAVersion
[2];
1003 /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
1004 // Removed this macro. Instead, we use the fields below
1005 //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION;
1006 //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt
1007 uprv_memcpy(myData
->UCAVersion
, UCAVersion
, sizeof(UVersionInfo
));
1008 u_getUnicodeVersion(myData
->UCDVersion
);
1010 writeOutData(myData
, &consts
, contractionCEs
, noOfContractions
, outputDir
, copyright
, status
);
1012 InverseUCATableHeader
*inverse
= assembleInverseTable(status
);
1013 uprv_memcpy(inverse
->UCAVersion
, UCAVersion
, sizeof(UVersionInfo
));
1014 writeOutInverseData(inverse
, outputDir
, copyright
, status
);
1016 uprv_uca_closeTempTable(t
);
1028 #endif /* #if !UCONFIG_NO_COLLATION */
/*
 * Command-line option table. main() addresses entries by index, so the
 * order here must match its use there:
 *   [0],[1] help        [2] copyright   [3] version
 *   [4] destination dir [5] source dir  [6] verbose
 *   [7] ICU data directory
 */
1030 static UOption options
[]={
1031 UOPTION_HELP_H
, /* 0 Numbers for those who*/
1032 UOPTION_HELP_QUESTION_MARK
, /* 1 can't count. */
1033 UOPTION_COPYRIGHT
, /* 2 */
1034 UOPTION_VERSION
, /* 3 */
1035 UOPTION_DESTDIR
, /* 4 */
1036 UOPTION_SOURCEDIR
, /* 5 */
1037 UOPTION_VERBOSE
, /* 6 */
1038 UOPTION_ICUDATADIR
/* 7 */
1039 /* weiv can't count :))))) */
1042 int main(int argc
, char* argv
[]) {
1043 UErrorCode status
= U_ZERO_ERROR
;
1044 const char* destdir
= NULL
;
1045 const char* srcDir
= NULL
;
1047 char *basename
= NULL
;
1048 const char *copyright
= NULL
;
1049 uprv_memset(&UCAVersion
, 0, 4);
1051 U_MAIN_INIT_ARGS(argc
, argv
);
1053 /* preset then read command line options */
1054 options
[4].value
=u_getDataDirectory();
1055 options
[5].value
="";
1056 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
1058 /* error handling, printing usage message */
1061 "error in command line argument \"%s\"\n",
1066 if(options
[0].doesOccur
|| options
[1].doesOccur
) {
1068 "usage: %s [-options] file\n"
1069 "\tRead in UCA collation text data and write out the binary collation data\n"
1071 "\t-h or -? or --help this usage text\n"
1072 "\t-V or --version show a version message\n"
1073 "\t-c or --copyright include a copyright notice\n"
1074 "\t-d or --destdir destination directory, followed by the path\n"
1075 "\t-s or --sourcedir source directory, followed by the path\n"
1076 "\t-v or --verbose turn on verbose output\n"
1077 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
1078 "\t followed by path, defaults to %s\n",
1079 argv
[0], u_getDataDirectory());
1080 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
1082 if(options
[3].doesOccur
) {
1083 fprintf(stdout
, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n",
1084 #if UCONFIG_NO_COLLATION
1087 UCA_FORMAT_VERSION_0
, UCA_FORMAT_VERSION_1
1090 fprintf(stdout
, U_COPYRIGHT_STRING
"\n");
1094 /* get the options values */
1095 destdir
= options
[4].value
;
1096 srcDir
= options
[5].value
;
1097 VERBOSE
= options
[6].doesOccur
;
1099 if (options
[2].doesOccur
) {
1100 copyright
= U_COPYRIGHT_STRING
;
1103 if (options
[7].doesOccur
) {
1104 u_setDataDirectory(options
[7].value
);
1106 /* Initialize ICU */
1108 if (U_FAILURE(status
) && status
!= U_FILE_ACCESS_ERROR
) {
1109 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
1110 argv
[0], u_errorName(status
));
1113 status
= U_ZERO_ERROR
;
1116 /* prepare the filename beginning with the source dir */
1117 uprv_strcpy(filename
, srcDir
);
1118 basename
=filename
+uprv_strlen(filename
);
1120 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
1121 *basename
++ = U_FILE_SEP_CHAR
;
1125 uprv_strcpy(basename
, "FractionalUCA.txt");
1128 uprv_strcpy(basename
, getLongPathname(*argv
));
1132 if(u_getCombiningClass(0x0053) == 0)
1134 fprintf(stderr
, "SEVERE ERROR: Normalization data is not functioning! Bailing out. Was not able to load unorm.dat.\n");
1139 #if UCONFIG_NO_COLLATION
1141 UNewDataMemory
*pData
;
1144 msg
= "genuca writes dummy " UCA_DATA_NAME
"." UCA_DATA_TYPE
" because of UCONFIG_NO_COLLATION, see uconfig.h";
1145 fprintf(stderr
, "%s\n", msg
);
1146 pData
= udata_create(destdir
, UCA_DATA_TYPE
, UCA_DATA_NAME
, &dummyDataInfo
,
1148 udata_writeBlock(pData
, msg
, strlen(msg
));
1149 udata_finish(pData
, &status
);
1151 msg
= "genuca writes dummy " INVC_DATA_NAME
"." INVC_DATA_TYPE
" because of UCONFIG_NO_COLLATION, see uconfig.h";
1152 fprintf(stderr
, "%s\n", msg
);
1153 pData
= udata_create(destdir
, INVC_DATA_TYPE
, INVC_DATA_NAME
, &dummyDataInfo
,
1155 udata_writeBlock(pData
, msg
, strlen(msg
));
1156 udata_finish(pData
, &status
);
1162 return write_uca_table(filename
, destdir
, copyright
, &status
);
1168 * Hey, Emacs, please set the following:
1171 * indent-tabs-mode: nil