2 *******************************************************************************
4 * Copyright (C) 1999-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: genprops.c
10 * tab size: 8 (not used)
13 * created on: 1999dec08
14 * created by: Markus W. Scherer
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts most of the properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
25 #include "unicode/utypes.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uset.h"
28 #include "unicode/putil.h"
29 #include "unicode/uclean.h"
42 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
44 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
47 * Unicode set collecting the case-sensitive characters;
48 * see uchar.h UCHAR_CASE_SENSITIVE.
49 * Add code points from case mappings/foldings in
50 * the root locale and with default options.
52 static USet
*caseSensitive
;
54 /* prototypes --------------------------------------------------------------- */
57 parseBidiMirroring(const char *filename
, UErrorCode
*pErrorCode
);
60 parseSpecialCasing(const char *filename
, UErrorCode
*pErrorCode
);
63 parseCaseFolding(const char *filename
, UErrorCode
*pErrorCode
);
66 parseDB(const char *filename
, UErrorCode
*pErrorCode
);
68 /* -------------------------------------------------------------------------- */
83 /* Keep these values in sync with the above enums */
84 static UOption options
[]={
86 UOPTION_HELP_QUESTION_MARK
,
91 { "unicode", NULL
, NULL
, NULL
, 'u', UOPT_REQUIRES_ARG
, 0 },
96 main(int argc
, char* argv
[]) {
98 const char *srcDir
=NULL
, *destDir
=NULL
, *suffix
=NULL
;
100 UErrorCode errorCode
=U_ZERO_ERROR
;
102 U_MAIN_INIT_ARGS(argc
, argv
);
104 /* preset then read command line options */
105 options
[DESTDIR
].value
=u_getDataDirectory();
106 options
[SOURCEDIR
].value
="";
107 options
[UNICODE_VERSION
].value
="";
108 options
[ICUDATADIR
].value
=u_getDataDirectory();
109 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
111 /* error handling, printing usage message */
114 "error in command line argument \"%s\"\n",
117 if(argc
<0 || options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
) {
119 * Broken into chunks because the C89 standard says the minimum
120 * required supported string length is 509 bytes.
123 "Usage: %s [-options] [suffix]\n"
125 "read the UnicodeData.txt file and other Unicode properties files and\n"
126 "create a binary file " DATA_NAME
"." DATA_TYPE
" with the character properties\n"
131 "\t-h or -? or --help this usage text\n"
132 "\t-v or --verbose verbose output\n"
133 "\t-c or --copyright include a copyright notice\n"
134 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
136 "\t-d or --destdir destination directory, followed by the path\n"
137 "\t-s or --sourcedir source directory, followed by the path\n"
138 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
139 "\t followed by path, defaults to %s\n"
140 "\tsuffix suffix that is to be appended with a '-'\n"
141 "\t to the source file basenames before opening;\n"
142 "\t 'genprops new' will read UnicodeData-new.txt etc.\n",
143 u_getDataDirectory());
144 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
147 /* get the options values */
148 beVerbose
=options
[VERBOSE
].doesOccur
;
149 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
150 srcDir
=options
[SOURCEDIR
].value
;
151 destDir
=options
[DESTDIR
].value
;
159 if(options
[UNICODE_VERSION
].doesOccur
) {
160 setUnicodeVersion(options
[UNICODE_VERSION
].value
);
162 /* else use the default dataVersion in store.c */
164 if (options
[ICUDATADIR
].doesOccur
) {
165 u_setDataDirectory(options
[ICUDATADIR
].value
);
168 /* prepare the filename beginning with the source dir */
169 uprv_strcpy(filename
, srcDir
);
170 basename
=filename
+uprv_strlen(filename
);
171 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
172 *basename
++=U_FILE_SEP_CHAR
;
177 caseSensitive
=uset_open(1, 0); /* empty set (start>end) */
179 /* process BidiMirroring.txt */
180 writeUCDFilename(basename
, "BidiMirroring", suffix
);
181 parseBidiMirroring(filename
, &errorCode
);
183 /* process SpecialCasing.txt */
184 writeUCDFilename(basename
, "SpecialCasing", suffix
);
185 parseSpecialCasing(filename
, &errorCode
);
187 /* process CaseFolding.txt */
188 writeUCDFilename(basename
, "CaseFolding", suffix
);
189 parseCaseFolding(filename
, &errorCode
);
191 /* process UnicodeData.txt */
192 writeUCDFilename(basename
, "UnicodeData", suffix
);
193 parseDB(filename
, &errorCode
);
195 /* process additional properties files */
197 generateAdditionalProperties(filename
, suffix
, &errorCode
);
199 /* process parsed data */
200 if(U_SUCCESS(errorCode
)) {
201 /* write the properties data file */
202 generateData(destDir
);
210 writeUCDFilename(char *basename
, const char *filename
, const char *suffix
) {
211 int32_t length
=(int32_t)uprv_strlen(filename
);
212 uprv_strcpy(basename
, filename
);
214 basename
[length
++]='-';
215 uprv_strcpy(basename
+length
, suffix
);
216 length
+=(int32_t)uprv_strlen(suffix
);
218 uprv_strcpy(basename
+length
, ".txt");
222 isToken(const char *token
, const char *s
) {
226 s
=u_skipWhitespace(s
);
233 z
=u_skipWhitespace(s
+j
);
234 if(*z
==';' || *z
==0) {
246 getTokenIndex(const char *const tokens
[], int32_t countTokens
, const char *s
) {
250 s
=u_skipWhitespace(s
);
251 for(i
=0; i
<countTokens
; ++i
) {
260 z
=u_skipWhitespace(s
+j
);
261 if(*z
==';' || *z
==0 || *z
=='#' || *z
=='\r' || *z
=='\n') {
274 _set_addAll(USet
*set
, const UChar
*s
, int32_t length
) {
278 /* needs length>=0 */
279 for(i
=0; i
<length
; /* U16_NEXT advances i */) {
280 U16_NEXT(s
, i
, length
, c
);
285 /* parser for BidiMirroring.txt --------------------------------------------- */
287 #define MAX_MIRROR_COUNT 2000
289 static uint32_t mirrorMappings
[MAX_MIRROR_COUNT
][2];
290 static int32_t mirrorCount
=0;
292 static void U_CALLCONV
293 mirrorLineFn(void *context
,
294 char *fields
[][2], int32_t fieldCount
,
295 UErrorCode
*pErrorCode
) {
297 static uint32_t prevCode
=0;
299 mirrorMappings
[mirrorCount
][0]=(uint32_t)uprv_strtoul(fields
[0][0], &end
, 16);
300 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
301 fprintf(stderr
, "genprops: syntax error in BidiMirroring.txt field 0 at %s\n", fields
[0][0]);
302 *pErrorCode
=U_PARSE_ERROR
;
306 mirrorMappings
[mirrorCount
][1]=(uint32_t)uprv_strtoul(fields
[1][0], &end
, 16);
307 if(end
<=fields
[1][0] || end
!=fields
[1][1]) {
308 fprintf(stderr
, "genprops: syntax error in BidiMirroring.txt field 1 at %s\n", fields
[1][0]);
309 *pErrorCode
=U_PARSE_ERROR
;
313 /* check that the code points (mirrorMappings[mirrorCount][0]) are in ascending order */
314 if(mirrorMappings
[mirrorCount
][0]<=prevCode
&& mirrorMappings
[mirrorCount
][0]>0) {
315 fprintf(stderr
, "genprops: error - BidiMirroring entries out of order, U+%04lx after U+%04lx\n",
316 (unsigned long)mirrorMappings
[mirrorCount
][0],
317 (unsigned long)prevCode
);
318 *pErrorCode
=U_PARSE_ERROR
;
321 prevCode
=mirrorMappings
[mirrorCount
][0];
323 if(++mirrorCount
==MAX_MIRROR_COUNT
) {
324 fprintf(stderr
, "genprops: too many mirror mappings\n");
325 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
326 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
/*
 * Parse BidiMirroring.txt.
 *
 * filename    path of the file to read
 * pErrorCode  in/out error code; it is checked for NULL and for an existing
 *             failure before parsing, and is passed on to the parser
 *
 * The file is handed to u_parseDelimitedFile() with ';' as the delimiter,
 * a field count of 2 and mirrorLineFn() as the per-line callback (no
 * context pointer).
 *
 * NOTE(review): the declaration of fields and the body of the guard are not
 * visible in this excerpt - presumably the guard returns early; confirm.
 */
331 parseBidiMirroring(const char *filename
, UErrorCode
*pErrorCode
) {
334 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
338 u_parseDelimitedFile(filename
, ';', fields
, 2, mirrorLineFn
, NULL
, pErrorCode
);
341 /* parser for SpecialCasing.txt --------------------------------------------- */
343 #define MAX_SPECIAL_CASING_COUNT 500
345 static SpecialCasing specialCasings
[MAX_SPECIAL_CASING_COUNT
];
346 static int32_t specialCasingCount
=0;
348 static void U_CALLCONV
349 specialCasingLineFn(void *context
,
350 char *fields
[][2], int32_t fieldCount
,
351 UErrorCode
*pErrorCode
) {
355 specialCasings
[specialCasingCount
].code
=(uint32_t)uprv_strtoul(u_skipWhitespace(fields
[0][0]), &end
, 16);
356 end
=(char *)u_skipWhitespace(end
);
357 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
358 fprintf(stderr
, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields
[0][0]);
359 *pErrorCode
=U_PARSE_ERROR
;
363 /* is this a complex mapping? */
364 if(*(end
=(char *)u_skipWhitespace(fields
[4][0]))!=0 && *end
!=';' && *end
!='#') {
365 /* there is some condition text in the fifth field */
366 specialCasings
[specialCasingCount
].isComplex
=TRUE
;
368 /* do not store any actual mappings for this */
369 specialCasings
[specialCasingCount
].lowerCase
[0]=0;
370 specialCasings
[specialCasingCount
].upperCase
[0]=0;
371 specialCasings
[specialCasingCount
].titleCase
[0]=0;
373 /* just set the "complex" flag and get the case mappings */
374 specialCasings
[specialCasingCount
].isComplex
=FALSE
;
375 specialCasings
[specialCasingCount
].lowerCase
[0]=
376 (UChar
)u_parseString(fields
[1][0], specialCasings
[specialCasingCount
].lowerCase
+1, 31, NULL
, pErrorCode
);
377 specialCasings
[specialCasingCount
].upperCase
[0]=
378 (UChar
)u_parseString(fields
[3][0], specialCasings
[specialCasingCount
].upperCase
+1, 31, NULL
, pErrorCode
);
379 specialCasings
[specialCasingCount
].titleCase
[0]=
380 (UChar
)u_parseString(fields
[2][0], specialCasings
[specialCasingCount
].titleCase
+1, 31, NULL
, pErrorCode
);
381 if(U_FAILURE(*pErrorCode
)) {
382 fprintf(stderr
, "genprops: error parsing special casing at %s\n", fields
[0][0]);
386 uset_add(caseSensitive
, (UChar32
)specialCasings
[specialCasingCount
].code
);
387 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].lowerCase
+1, specialCasings
[specialCasingCount
].lowerCase
[0]);
388 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].upperCase
+1, specialCasings
[specialCasingCount
].upperCase
[0]);
389 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].titleCase
+1, specialCasings
[specialCasingCount
].titleCase
[0]);
392 if(++specialCasingCount
==MAX_SPECIAL_CASING_COUNT
) {
393 fprintf(stderr
, "genprops: too many special casing mappings\n");
394 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
395 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
400 compareSpecialCasings(const void *left
, const void *right
) {
401 return ((const SpecialCasing
*)left
)->code
-((const SpecialCasing
*)right
)->code
;
405 parseSpecialCasing(const char *filename
, UErrorCode
*pErrorCode
) {
409 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
413 u_parseDelimitedFile(filename
, ';', fields
, 5, specialCasingLineFn
, NULL
, pErrorCode
);
415 /* sort the special casing entries by code point */
416 if(specialCasingCount
>0) {
417 qsort(specialCasings
, specialCasingCount
, sizeof(SpecialCasing
), compareSpecialCasings
);
420 /* replace multiple entries for any code point by one "complex" one */
422 for(i
=1; i
<specialCasingCount
; ++i
) {
423 if(specialCasings
[i
-1].code
==specialCasings
[i
].code
) {
424 /* there is a duplicate code point */
425 specialCasings
[i
-1].code
=0x7fffffff; /* remove this entry in the following qsort */
426 specialCasings
[i
].isComplex
=TRUE
; /* make the following one complex */
427 specialCasings
[i
].lowerCase
[0]=0;
428 specialCasings
[i
].upperCase
[0]=0;
429 specialCasings
[i
].titleCase
[0]=0;
434 /* if some entries just were removed, then re-sort */
436 qsort(specialCasings
, specialCasingCount
, sizeof(SpecialCasing
), compareSpecialCasings
);
437 specialCasingCount
-=j
;
441 * Add one complex mapping to caseSensitive that was filtered out above:
442 * Greek final Sigma has a conditional mapping but not locale-sensitive,
443 * and it is taken when lowercasing just U+03A3 alone.
444 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
446 uset_add(caseSensitive
, 0x3c2);
449 /* parser for CaseFolding.txt ----------------------------------------------- */
451 #define MAX_CASE_FOLDING_COUNT 2000
453 static CaseFolding caseFoldings
[MAX_CASE_FOLDING_COUNT
];
454 static int32_t caseFoldingCount
=0;
456 static void U_CALLCONV
457 caseFoldingLineFn(void *context
,
458 char *fields
[][2], int32_t fieldCount
,
459 UErrorCode
*pErrorCode
) {
461 static uint32_t prevCode
=0;
466 caseFoldings
[caseFoldingCount
].code
=(uint32_t)uprv_strtoul(u_skipWhitespace(fields
[0][0]), &end
, 16);
467 end
=(char *)u_skipWhitespace(end
);
468 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
469 fprintf(stderr
, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields
[0][0]);
470 *pErrorCode
=U_PARSE_ERROR
;
474 /* get the status of this mapping */
475 caseFoldings
[caseFoldingCount
].status
=status
=*u_skipWhitespace(fields
[1][0]);
476 if(status
!='L' && status
!='E' && status
!='C' && status
!='S' && status
!='F' && status
!='I' && status
!='T') {
477 fprintf(stderr
, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields
[0][0]);
478 *pErrorCode
=U_PARSE_ERROR
;
482 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
487 /* get the mapping */
488 count
=caseFoldings
[caseFoldingCount
].full
[0]=
489 (UChar
)u_parseString(fields
[2][0], caseFoldings
[caseFoldingCount
].full
+1, 31, &caseFoldings
[caseFoldingCount
].simple
, pErrorCode
);
490 if(U_FAILURE(*pErrorCode
)) {
491 fprintf(stderr
, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields
[0][0]);
495 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
496 if(count
==0 || count
>2 || (count
==2 && UTF_IS_SINGLE(caseFoldings
[caseFoldingCount
].full
[1]))) {
497 caseFoldings
[caseFoldingCount
].simple
=0;
500 /* update the case-sensitive set */
502 uset_add(caseSensitive
, (UChar32
)caseFoldings
[caseFoldingCount
].code
);
503 _set_addAll(caseSensitive
, caseFoldings
[caseFoldingCount
].full
+1, caseFoldings
[caseFoldingCount
].full
[0]);
506 /* check the status */
508 /* check if there was a full mapping for this code point before */
509 if( caseFoldingCount
>0 &&
510 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
&&
511 caseFoldings
[caseFoldingCount
-1].status
=='F'
513 /* merge the two entries */
514 caseFoldings
[caseFoldingCount
-1].simple
=caseFoldings
[caseFoldingCount
].simple
;
517 } else if(status
=='F') {
518 /* check if there was a simple mapping for this code point before */
519 if( caseFoldingCount
>0 &&
520 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
&&
521 caseFoldings
[caseFoldingCount
-1].status
=='S'
523 /* merge the two entries */
524 uprv_memcpy(caseFoldings
[caseFoldingCount
-1].full
, caseFoldings
[caseFoldingCount
].full
, 32*U_SIZEOF_UCHAR
);
527 } else if(status
=='I' || status
=='T') {
528 /* check if there was a default mapping for this code point before (remove it) */
529 while(caseFoldingCount
>0 &&
530 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
535 /* store only a marker for special handling for cases like dotless i */
536 caseFoldings
[caseFoldingCount
].simple
=0;
537 caseFoldings
[caseFoldingCount
].full
[0]=0;
540 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
541 if(caseFoldings
[caseFoldingCount
].code
<=prevCode
&& caseFoldings
[caseFoldingCount
].code
>0) {
542 fprintf(stderr
, "genprops: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
543 (unsigned long)caseFoldings
[caseFoldingCount
].code
,
544 (unsigned long)prevCode
);
545 *pErrorCode
=U_PARSE_ERROR
;
548 prevCode
=caseFoldings
[caseFoldingCount
].code
;
550 if(++caseFoldingCount
==MAX_CASE_FOLDING_COUNT
) {
551 fprintf(stderr
, "genprops: too many case folding mappings\n");
552 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
553 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
/*
 * Parse CaseFolding.txt.
 *
 * filename    path of the file to read
 * pErrorCode  in/out error code; it is checked for NULL and for an existing
 *             failure before parsing, and is passed on to the parser
 *
 * The file is handed to u_parseDelimitedFile() with ';' as the delimiter,
 * a field count of 3 and caseFoldingLineFn() as the per-line callback (no
 * context pointer).
 *
 * NOTE(review): the declaration of fields and the body of the guard are not
 * visible in this excerpt - presumably the guard returns early; confirm.
 */
558 parseCaseFolding(const char *filename
, UErrorCode
*pErrorCode
) {
561 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
565 u_parseDelimitedFile(filename
, ';', fields
, 3, caseFoldingLineFn
, NULL
, pErrorCode
);
568 /* parser for UnicodeData.txt ----------------------------------------------- */
570 /* general categories */
572 genCategoryNames
[U_CHAR_CATEGORY_COUNT
]={
574 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
575 "Mc", "Nd", "Nl", "No",
577 "Cc", "Cf", "Co", "Cs",
578 "Pd", "Ps", "Pe", "Pc", "Po",
579 "Sm", "Sc", "Sk", "So",
584 bidiNames
[U_CHAR_DIRECTION_COUNT
]={
585 "L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
586 "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
590 decompositionTypeNames
[U_DT_COUNT
]={
612 uint32_t first
, last
, props
;
616 static int32_t unicodeAreaIndex
=0, mirrorIndex
=0, specialCasingIndex
=0, caseFoldingIndex
=0;
618 static void U_CALLCONV
619 unicodeDataLineFn(void *context
,
620 char *fields
[][2], int32_t fieldCount
,
621 UErrorCode
*pErrorCode
) {
624 static uint32_t prevCode
=0;
628 /* reset the properties */
629 uprv_memset(&p
, 0, sizeof(Props
));
631 /* get the character code, field 0 */
632 p
.code
=(uint32_t)uprv_strtoul(fields
[0][0], &end
, 16);
633 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
634 fprintf(stderr
, "genprops: syntax error in field 0 at %s\n", fields
[0][0]);
635 *pErrorCode
=U_PARSE_ERROR
;
639 /* get general category, field 2 */
640 i
=getTokenIndex(genCategoryNames
, U_CHAR_CATEGORY_COUNT
, fields
[2][0]);
642 p
.generalCategory
=(uint8_t)i
;
644 fprintf(stderr
, "genprops: unknown general category \"%s\" at code 0x%lx\n",
645 fields
[2][0], (unsigned long)p
.code
);
646 *pErrorCode
=U_PARSE_ERROR
;
650 /* get BiDi category, field 4 */
651 i
=getTokenIndex(bidiNames
, U_CHAR_DIRECTION_COUNT
, fields
[4][0]);
655 fprintf(stderr
, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
656 fields
[4][0], (unsigned long)p
.code
);
657 *pErrorCode
=U_PARSE_ERROR
;
661 /* get decomposition type, field 5 */
662 if(fields
[5][0]<fields
[5][1]) {
663 /* there is some decomposition */
664 if(*fields
[5][0]!='<') {
668 /* get compatibility type */
670 while(end
<fields
[5][1] && *end
!='>') {
674 i
=getTokenIndex(decompositionTypeNames
, U_DT_COUNT
, fields
[5][0]+1);
676 fprintf(stderr
, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
677 fields
[5][0], (unsigned long)p
.code
);
678 *pErrorCode
=U_PARSE_ERROR
;
682 if(!upvec_setValue(pv
, p
.code
, p
.code
+1, 2, (uint32_t)i
, UPROPS_DT_MASK
, pErrorCode
)) {
683 fprintf(stderr
, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode
));
688 /* decimal digit value, field 6 */
689 if(fields
[6][0]<fields
[6][1]) {
690 value
=(uint32_t)uprv_strtoul(fields
[6][0], &end
, 10);
691 if(end
!=fields
[6][1] || value
>0x7fff) {
692 fprintf(stderr
, "genprops: syntax error in field 6 at code 0x%lx\n",
693 (unsigned long)p
.code
);
694 *pErrorCode
=U_PARSE_ERROR
;
697 p
.numericValue
=(int32_t)value
;
701 /* digit value, field 7 */
702 if(fields
[7][0]<fields
[7][1]) {
703 value
=(uint32_t)uprv_strtoul(fields
[7][0], &end
, 10);
704 if(end
!=fields
[7][1] || value
>0x7fff) {
705 fprintf(stderr
, "genprops: syntax error in field 7 at code 0x%lx\n",
706 (unsigned long)p
.code
);
707 *pErrorCode
=U_PARSE_ERROR
;
710 if(p
.numericType
==0) {
711 p
.numericValue
=(int32_t)value
;
713 } else if((int32_t)value
!=p
.numericValue
) {
714 fprintf(stderr
, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
715 (unsigned long)p
.code
);
716 *pErrorCode
=U_PARSE_ERROR
;
721 /* numeric value, field 8 */
722 if(fields
[8][0]<fields
[8][1]) {
723 char *s
=fields
[8][0];
726 /* get a possible minus sign */
734 value
=(uint32_t)uprv_strtoul(s
, &end
, 10);
735 if(value
>0 && *end
=='/') {
736 /* field 8 may contain a fractional value, get the denominator */
737 if(p
.numericType
>0) {
738 fprintf(stderr
, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
739 (unsigned long)p
.code
);
740 *pErrorCode
=U_PARSE_ERROR
;
744 p
.denominator
=(uint32_t)uprv_strtoul(end
+1, &end
, 10);
745 if(p
.denominator
==0) {
746 fprintf(stderr
, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
747 (unsigned long)p
.code
);
748 *pErrorCode
=U_PARSE_ERROR
;
752 if(end
!=fields
[8][1] || value
>0x7fffffff) {
753 fprintf(stderr
, "genprops: syntax error in field 8 at code 0x%lx\n",
754 (unsigned long)p
.code
);
755 *pErrorCode
=U_PARSE_ERROR
;
759 if(p
.numericType
==0) {
761 p
.numericValue
=-(int32_t)value
;
763 p
.numericValue
=(int32_t)value
;
766 } else if((int32_t)value
!=p
.numericValue
) {
767 fprintf(stderr
, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
768 (unsigned long)p
.code
);
769 *pErrorCode
=U_PARSE_ERROR
;
774 /* get Mirrored flag, field 9 */
775 if(*fields
[9][0]=='Y') {
777 } else if(fields
[9][1]-fields
[9][0]!=1 || *fields
[9][0]!='N') {
778 fprintf(stderr
, "genprops: syntax error in field 9 at code 0x%lx\n",
779 (unsigned long)p
.code
);
780 *pErrorCode
=U_PARSE_ERROR
;
784 /* get uppercase mapping, field 12 */
785 value
=(uint32_t)uprv_strtoul(fields
[12][0], &end
, 16);
786 if(end
!=fields
[12][1]) {
787 fprintf(stderr
, "genprops: syntax error in field 12 at code 0x%lx\n",
788 (unsigned long)p
.code
);
789 *pErrorCode
=U_PARSE_ERROR
;
792 if(value
!=0 && value
!=p
.code
) {
794 uset_add(caseSensitive
, (UChar32
)p
.code
);
795 uset_add(caseSensitive
, (UChar32
)value
);
798 /* get lowercase value, field 13 */
799 value
=(uint32_t)uprv_strtoul(fields
[13][0], &end
, 16);
800 if(end
!=fields
[13][1]) {
801 fprintf(stderr
, "genprops: syntax error in field 13 at code 0x%lx\n",
802 (unsigned long)p
.code
);
803 *pErrorCode
=U_PARSE_ERROR
;
806 if(value
!=0 && value
!=p
.code
) {
808 uset_add(caseSensitive
, (UChar32
)p
.code
);
809 uset_add(caseSensitive
, (UChar32
)value
);
812 /* get titlecase value, field 14 */
813 value
=(uint32_t)uprv_strtoul(fields
[14][0], &end
, 16);
814 if(end
!=fields
[14][1]) {
815 fprintf(stderr
, "genprops: syntax error in field 14 at code 0x%lx\n",
816 (unsigned long)p
.code
);
817 *pErrorCode
=U_PARSE_ERROR
;
820 if(value
!=0 && value
!=p
.code
) {
822 uset_add(caseSensitive
, (UChar32
)p
.code
);
823 uset_add(caseSensitive
, (UChar32
)value
);
826 /* set additional properties from previously parsed files */
827 if(mirrorIndex
<mirrorCount
&& p
.code
==mirrorMappings
[mirrorIndex
][0]) {
828 p
.mirrorMapping
=mirrorMappings
[mirrorIndex
++][1];
830 if(specialCasingIndex
<specialCasingCount
&& p
.code
==specialCasings
[specialCasingIndex
].code
) {
831 p
.specialCasing
=specialCasings
+specialCasingIndex
++;
833 p
.specialCasing
=NULL
;
835 if(caseFoldingIndex
<caseFoldingCount
&& p
.code
==caseFoldings
[caseFoldingIndex
].code
) {
836 p
.caseFolding
=caseFoldings
+caseFoldingIndex
++;
838 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
839 if( p
.caseFolding
->status
=='C' &&
840 p
.caseFolding
->simple
==p
.lowerCase
850 if(*fields
[1][0]=='<') {
851 /* first or last entry of a Unicode area */
852 size_t length
=fields
[1][1]-fields
[1][0];
855 /* name too short for an area name */
856 } else if(0==uprv_memcmp(", First>", fields
[1][1]-8, 8)) {
857 /* set the current area */
858 if(unicodeAreas
[unicodeAreaIndex
].first
==0xffffffff) {
860 unicodeAreas
[unicodeAreaIndex
].first
=p
.code
;
861 unicodeAreas
[unicodeAreaIndex
].props
=value
;
862 uprv_memcpy(unicodeAreas
[unicodeAreaIndex
].name
, fields
[1][0]+1, length
);
863 unicodeAreas
[unicodeAreaIndex
].name
[length
]=0;
865 /* error: a previous area is incomplete */
866 fprintf(stderr
, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas
[unicodeAreaIndex
].name
);
867 *pErrorCode
=U_PARSE_ERROR
;
871 } else if(0==uprv_memcmp(", Last>", fields
[1][1]-7, 7)) {
872 /* check that the current area matches, and complete it with the last code point */
874 if( unicodeAreas
[unicodeAreaIndex
].props
==value
&&
875 0==uprv_memcmp(unicodeAreas
[unicodeAreaIndex
].name
, fields
[1][0]+1, length
) &&
876 unicodeAreas
[unicodeAreaIndex
].name
[length
]==0 &&
877 unicodeAreas
[unicodeAreaIndex
].first
<p
.code
879 unicodeAreas
[unicodeAreaIndex
].last
=p
.code
;
881 printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
882 (unsigned long)unicodeAreas
[unicodeAreaIndex
].first
,
883 (unsigned long)unicodeAreas
[unicodeAreaIndex
].last
,
884 unicodeAreas
[unicodeAreaIndex
].name
);
886 unicodeAreas
[++unicodeAreaIndex
].first
=0xffffffff;
888 /* error: different properties between first & last, different area name, first>=last */
889 fprintf(stderr
, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas
[unicodeAreaIndex
].name
);
890 *pErrorCode
=U_PARSE_ERROR
;
895 /* not an area name */
899 /* check for non-character code points */
900 if((p
.code
&0xfffe)==0xfffe || (uint32_t)(p
.code
-0xfdd0)<0x20) {
901 fprintf(stderr
, "genprops: error - properties for non-character code point U+%04lx\n",
902 (unsigned long)p
.code
);
903 *pErrorCode
=U_PARSE_ERROR
;
907 /* check that the code points (p.code) are in ascending order */
908 if(p
.code
<=prevCode
&& p
.code
>0) {
909 fprintf(stderr
, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
910 (unsigned long)p
.code
, (unsigned long)prevCode
);
911 *pErrorCode
=U_PARSE_ERROR
;
916 /* properties for a single code point */
917 addProps(p
.code
, value
);
920 /* set repeated properties for the areas */
925 UBool hasPlane15PUA
, hasPlane16PUA
;
926 UErrorCode errorCode
;
929 * UnicodeData.txt before 3.0.1 did not contain the PUAs on
931 * If that is the case, then we add them here, using the properties
935 hasPlane15PUA
=hasPlane16PUA
=FALSE
;
937 for(i
=0; i
<unicodeAreaIndex
; ++i
) {
938 repeatProps(unicodeAreas
[i
].first
,
939 unicodeAreas
[i
].last
,
940 unicodeAreas
[i
].props
);
941 if(unicodeAreas
[i
].first
==0xe000) {
942 puaProps
=unicodeAreas
[i
].props
;
943 } else if(unicodeAreas
[i
].first
==0xf0000) {
945 } else if(unicodeAreas
[i
].first
==0x100000) {
952 repeatProps(0xf0000, 0xffffd, puaProps
);
955 repeatProps(0x100000, 0x10fffd, puaProps
);
959 /* Hangul have canonical decompositions */
960 errorCode
=U_ZERO_ERROR
;
961 if(!upvec_setValue(pv
, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL
, UPROPS_DT_MASK
, &errorCode
)) {
962 fprintf(stderr
, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode
));
968 parseDB(const char *filename
, UErrorCode
*pErrorCode
) {
969 /* default Bidi classes for unassigned code points */
970 static const uint32_t defaultBidi
[][2]={ /* { limit, class } */
971 { 0x0590, U_LEFT_TO_RIGHT
},
972 { 0x0600, U_RIGHT_TO_LEFT
},
973 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC
},
974 { 0xFB1D, U_LEFT_TO_RIGHT
},
975 { 0xFB50, U_RIGHT_TO_LEFT
},
976 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC
},
977 { 0xFE70, U_LEFT_TO_RIGHT
},
978 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC
},
979 { 0x110000, U_LEFT_TO_RIGHT
}
987 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
992 * Set default Bidi classes for unassigned code points.
993 * See table 3-7 "Bidirectional Character Types" in UAX #9.
994 * http://www.unicode.org/reports/tr9/
997 for(i
=0; i
<LENGTHOF(defaultBidi
); ++i
) {
998 if(defaultBidi
[i
][1]!=0) {
999 repeatProps(prev
, defaultBidi
[i
][0]-1, defaultBidi
[i
][1]<<UPROPS_BIDI_SHIFT
);
1001 prev
=defaultBidi
[i
][0];
1004 /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
1005 unicodeAreas
[0].first
=0xffffffff;
1007 u_parseDelimitedFile(filename
, ';', fields
, 15, unicodeDataLineFn
, NULL
, pErrorCode
);
1009 if(unicodeAreas
[unicodeAreaIndex
].first
!=0xffffffff) {
1010 fprintf(stderr
, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
1011 unicodeAreas
[unicodeAreaIndex
].name
,
1012 (unsigned long)unicodeAreas
[unicodeAreaIndex
].first
);
1013 *pErrorCode
=U_PARSE_ERROR
;
1014 exit(U_PARSE_ERROR
);
1019 /* are all sub-properties consumed? */
1020 if(mirrorIndex
<mirrorCount
) {
1021 fprintf(stderr
, "genprops: error - some code points in BidiMirroring.txt are missing from UnicodeData.txt\n");
1022 *pErrorCode
=U_PARSE_ERROR
;
1023 exit(U_PARSE_ERROR
);
1025 if(specialCasingIndex
<specialCasingCount
) {
1026 fprintf(stderr
, "genprops: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
1027 *pErrorCode
=U_PARSE_ERROR
;
1028 exit(U_PARSE_ERROR
);
1030 if(caseFoldingIndex
<caseFoldingCount
) {
1031 fprintf(stderr
, "genprops: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
1032 *pErrorCode
=U_PARSE_ERROR
;
1033 exit(U_PARSE_ERROR
);
1036 if(U_FAILURE(*pErrorCode
)) {
1041 0==uset_getItem(caseSensitive
, i
, &start
, &end
, NULL
, 0, pErrorCode
) && U_SUCCESS(*pErrorCode
);
1044 addCaseSensitive(start
, end
);
1046 if(*pErrorCode
==U_INDEX_OUTOFBOUNDS_ERROR
) {
1047 *pErrorCode
=U_ZERO_ERROR
;
1052 * Hey, Emacs, please set the following:
1055 * indent-tabs-mode: nil