2 *******************************************************************************
4 * Copyright (C) 2004-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts the case mapping properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uset.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
41 /* data --------------------------------------------------------------------- */
45 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
48 * Unicode set collecting the case-sensitive characters;
49 * see uchar.h UCHAR_CASE_SENSITIVE.
50 * Add code points from case mappings/foldings in
51 * the root locale and with default options.
53 static USet
*caseSensitive
;
55 /* prototypes --------------------------------------------------------------- */
58 parseSpecialCasing(const char *filename
, UErrorCode
*pErrorCode
);
61 parseCaseFolding(const char *filename
, UErrorCode
*pErrorCode
);
64 parseDB(const char *filename
, UErrorCode
*pErrorCode
);
66 /* parse files with multiple binary properties ------------------------------ */
68 /* TODO: more common code, move functions to uparse.h|c */
70 /* TODO: similar to genprops/props2.c but not the same */
75 uint32_t vecValue
, vecMask
;
77 typedef struct Binary Binary
;
81 const Binary
*binaries
;
82 int32_t binariesCount
;
84 typedef struct Binaries Binaries
;
88 { "Soft_Dotted", 0, UCASE_SOFT_DOTTED
, UCASE_DOT_MASK
}
93 "PropList", propListNames
, LENGTHOF(propListNames
)
98 { "Lowercase", 0, UCASE_LOWER
, UCASE_TYPE_MASK
},
99 { "Uppercase", 0, UCASE_UPPER
, UCASE_TYPE_MASK
}
102 static const Binaries
103 derCorePropsBinaries
={
104 "DerivedCoreProperties", derCorePropsNames
, LENGTHOF(derCorePropsNames
)
108 * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
109 * We need not distinguish between them because both add to case-ignorable.
110 * We ignore all other Word_Break values.
114 { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT
), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT
) },
115 { "MidNumLet", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT
), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT
) }
118 static const Binaries
120 "WordBreakProperty", wordBreakNames
, LENGTHOF(wordBreakNames
)
123 static void U_CALLCONV
124 binariesLineFn(void *context
,
125 char *fields
[][2], int32_t fieldCount
,
126 UErrorCode
*pErrorCode
) {
129 uint32_t start
, limit
;
132 bin
=(const Binaries
*)context
;
134 u_parseCodePointRange(fields
[0][0], &start
, &limit
, pErrorCode
);
135 if(U_FAILURE(*pErrorCode
)) {
136 fprintf(stderr
, "gencase: syntax error in %s.txt field 0 at %s\n", bin
->ucdFile
, fields
[0][0]);
141 /* parse binary property name */
142 s
=(char *)u_skipWhitespace(fields
[1][0]);
144 if(i
==bin
->binariesCount
) {
145 /* ignore unrecognized properties */
148 if(isToken(bin
->binaries
[i
].propName
, s
)) {
153 if(bin
->binaries
[i
].vecMask
==0) {
154 fprintf(stderr
, "gencase error: mask value %d==0 for %s %s\n",
155 (int)bin
->binaries
[i
].vecMask
, bin
->ucdFile
, bin
->binaries
[i
].propName
);
156 exit(U_INTERNAL_PROGRAM_ERROR
);
159 if(!upvec_setValue(pv
, start
, limit
, bin
->binaries
[i
].vecWord
, bin
->binaries
[i
].vecValue
, bin
->binaries
[i
].vecMask
, pErrorCode
)) {
160 fprintf(stderr
, "gencase error: unable to set %s, code: %s\n",
161 bin
->binaries
[i
].propName
, u_errorName(*pErrorCode
));
167 parseBinariesFile(char *filename
, char *basename
, const char *suffix
,
169 UErrorCode
*pErrorCode
) {
172 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
176 writeUCDFilename(basename
, bin
->ucdFile
, suffix
);
178 u_parseDelimitedFile(filename
, ';', fields
, 2, binariesLineFn
, (void *)bin
, pErrorCode
);
179 if(U_FAILURE(*pErrorCode
)) {
180 fprintf(stderr
, "error parsing %s.txt: %s\n", bin
->ucdFile
, u_errorName(*pErrorCode
));
184 /* -------------------------------------------------------------------------- */
199 /* Keep these values in sync with the above enums */
200 static UOption options
[]={
202 UOPTION_HELP_QUESTION_MARK
,
207 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG
),
209 UOPTION_DEF("csource", 'C', UOPT_NO_ARG
)
213 main(int argc
, char* argv
[]) {
215 const char *srcDir
=NULL
, *destDir
=NULL
, *suffix
=NULL
;
217 UErrorCode errorCode
=U_ZERO_ERROR
;
219 U_MAIN_INIT_ARGS(argc
, argv
);
221 /* preset then read command line options */
222 options
[DESTDIR
].value
=u_getDataDirectory();
223 options
[SOURCEDIR
].value
="";
224 options
[UNICODE_VERSION
].value
="";
225 options
[ICUDATADIR
].value
=u_getDataDirectory();
226 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
228 /* error handling, printing usage message */
231 "error in command line argument \"%s\"\n",
234 if(argc
<0 || options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
) {
236 * Broken into chunks because the C89 standard says the minimum
237 * required supported string length is 509 bytes.
240 "Usage: %s [-options] [suffix]\n"
242 "read the UnicodeData.txt file and other Unicode properties files and\n"
243 "create a binary file " UCASE_DATA_NAME
"." UCASE_DATA_TYPE
" with the case mapping properties\n"
248 "\t-h or -? or --help this usage text\n"
249 "\t-v or --verbose verbose output\n"
250 "\t-c or --copyright include a copyright notice\n"
251 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
252 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
254 "\t-d or --destdir destination directory, followed by the path\n"
255 "\t-s or --sourcedir source directory, followed by the path\n"
256 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
257 "\t followed by path, defaults to %s\n"
258 "\tsuffix suffix that is to be appended with a '-'\n"
259 "\t to the source file basenames before opening;\n"
260 "\t 'gencase new' will read UnicodeData-new.txt etc.\n",
261 u_getDataDirectory());
262 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
265 /* get the options values */
266 beVerbose
=options
[VERBOSE
].doesOccur
;
267 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
268 srcDir
=options
[SOURCEDIR
].value
;
269 destDir
=options
[DESTDIR
].value
;
277 if(options
[UNICODE_VERSION
].doesOccur
) {
278 setUnicodeVersion(options
[UNICODE_VERSION
].value
);
280 /* else use the default dataVersion in store.c */
282 if (options
[ICUDATADIR
].doesOccur
) {
283 u_setDataDirectory(options
[ICUDATADIR
].value
);
286 /* prepare the filename beginning with the source dir */
287 uprv_strcpy(filename
, srcDir
);
288 basename
=filename
+uprv_strlen(filename
);
289 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
290 *basename
++=U_FILE_SEP_CHAR
;
294 pv
=upvec_open(2, 10000);
295 caseSensitive
=uset_open(1, 0); /* empty set (start>end) */
297 /* process SpecialCasing.txt */
298 writeUCDFilename(basename
, "SpecialCasing", suffix
);
299 parseSpecialCasing(filename
, &errorCode
);
301 /* process CaseFolding.txt */
302 writeUCDFilename(basename
, "CaseFolding", suffix
);
303 parseCaseFolding(filename
, &errorCode
);
305 /* process additional properties files */
308 parseBinariesFile(filename
, basename
, suffix
, &propListBinaries
, &errorCode
);
310 parseBinariesFile(filename
, basename
, suffix
, &derCorePropsBinaries
, &errorCode
);
312 if(ucdVersion
>=UNI_4_1
) {
313 parseBinariesFile(filename
, basename
, suffix
, &wordBreakBinaries
, &errorCode
);
316 /* process UnicodeData.txt */
317 writeUCDFilename(basename
, "UnicodeData", suffix
);
318 parseDB(filename
, &errorCode
);
320 /* process parsed data */
325 if(U_SUCCESS(errorCode
)) {
326 /* write the properties data file */
327 generateData(destDir
, options
[CSOURCE
].doesOccur
);
335 writeUCDFilename(char *basename
, const char *filename
, const char *suffix
) {
336 int32_t length
=(int32_t)uprv_strlen(filename
);
337 uprv_strcpy(basename
, filename
);
339 basename
[length
++]='-';
340 uprv_strcpy(basename
+length
, suffix
);
341 length
+=(int32_t)uprv_strlen(suffix
);
343 uprv_strcpy(basename
+length
, ".txt");
346 /* TODO: move to toolutil */
/*
 * Appears to test whether the text at s is the given token (it is used as
 * a condition on a property name elsewhere in this file).
 *
 * What the visible code does:
 * - leading white space in s is skipped with u_skipWhitespace();
 * - after j characters of s, optional white space must be followed by
 *   ';' or by the end of the string for the terminator check to succeed.
 *
 * NOTE(review): the loop that compares token against s and sets j, the
 * local declarations and the return statements are not visible in this
 * extract - confirm against the complete file.
 */
348 isToken(const char *token
, const char *s
) {
/* ignore white space in front of the candidate text */
352 s
=u_skipWhitespace(s
);
/* look at what follows the j characters already examined */
359 z
=u_skipWhitespace(s
+j
);
/* the candidate must end at a field delimiter or at the end of the string */
360 if(*z
==';' || *z
==0) {
/*
 * Looks through tokens[0..countTokens-1] for an entry matching the text
 * at s; the caller in this file stores the result in an index variable.
 *
 * What the visible code does:
 * - leading white space in s is skipped once, before the loop;
 * - for each candidate, optional white space after j characters of s must
 *   be followed by ';', the end of the string, '#', CR or LF.
 *
 * NOTE(review): the per-token comparison that sets j, the return
 * statements (including the not-found value) and the local declarations
 * are not visible in this extract - confirm against the complete file.
 */
372 getTokenIndex(const char *const tokens
[], int32_t countTokens
, const char *s
) {
/* ignore white space in front of the candidate text */
376 s
=u_skipWhitespace(s
);
/* try each known token in array order */
377 for(i
=0; i
<countTokens
; ++i
) {
/* look at what follows the j characters already examined */
386 z
=u_skipWhitespace(s
+j
);
/* accept a field delimiter, end of string, comment start or line end */
387 if(*z
==';' || *z
==0 || *z
=='#' || *z
=='\r' || *z
=='\n') {
/*
 * Walks the UTF-16 string s[0..length) one code point at a time.
 *
 * set     the USet passed in by the caller (caseSensitive at all call
 *         sites in this file)
 * s       UTF-16 code units; not written through (const)
 * length  number of code units in s; must be >=0
 *
 * NOTE(review): the statement that consumes c inside the loop is not
 * visible in this extract; from the function name it presumably adds each
 * code point to set - confirm against the complete file.
 */
400 _set_addAll(USet
*set
, const UChar
*s
, int32_t length
) {
404 /* needs length>=0 */
/* no increment expression: U16_NEXT moves i past one or two code units */
405 for(i
=0; i
<length
; /* U16_NEXT advances i */) {
/* read the next code point from s into c */
406 U16_NEXT(s
, i
, length
, c
);
411 /* parser for SpecialCasing.txt --------------------------------------------- */
413 #define MAX_SPECIAL_CASING_COUNT 500
415 static SpecialCasing specialCasings
[MAX_SPECIAL_CASING_COUNT
];
416 static int32_t specialCasingCount
=0;
418 static void U_CALLCONV
419 specialCasingLineFn(void *context
,
420 char *fields
[][2], int32_t fieldCount
,
421 UErrorCode
*pErrorCode
) {
425 specialCasings
[specialCasingCount
].code
=(UChar32
)uprv_strtoul(u_skipWhitespace(fields
[0][0]), &end
, 16);
426 end
=(char *)u_skipWhitespace(end
);
427 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
428 fprintf(stderr
, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields
[0][0]);
429 *pErrorCode
=U_PARSE_ERROR
;
433 /* is this a complex mapping? */
434 if(*(end
=(char *)u_skipWhitespace(fields
[4][0]))!=0 && *end
!=';' && *end
!='#') {
435 /* there is some condition text in the fifth field */
436 specialCasings
[specialCasingCount
].isComplex
=TRUE
;
438 /* do not store any actual mappings for this */
439 specialCasings
[specialCasingCount
].lowerCase
[0]=0;
440 specialCasings
[specialCasingCount
].upperCase
[0]=0;
441 specialCasings
[specialCasingCount
].titleCase
[0]=0;
443 /* just set the "complex" flag and get the case mappings */
444 specialCasings
[specialCasingCount
].isComplex
=FALSE
;
445 specialCasings
[specialCasingCount
].lowerCase
[0]=
446 (UChar
)u_parseString(fields
[1][0], specialCasings
[specialCasingCount
].lowerCase
+1, 31, NULL
, pErrorCode
);
447 specialCasings
[specialCasingCount
].upperCase
[0]=
448 (UChar
)u_parseString(fields
[3][0], specialCasings
[specialCasingCount
].upperCase
+1, 31, NULL
, pErrorCode
);
449 specialCasings
[specialCasingCount
].titleCase
[0]=
450 (UChar
)u_parseString(fields
[2][0], specialCasings
[specialCasingCount
].titleCase
+1, 31, NULL
, pErrorCode
);
451 if(U_FAILURE(*pErrorCode
)) {
452 fprintf(stderr
, "gencase: error parsing special casing at %s\n", fields
[0][0]);
456 uset_add(caseSensitive
, (UChar32
)specialCasings
[specialCasingCount
].code
);
457 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].lowerCase
+1, specialCasings
[specialCasingCount
].lowerCase
[0]);
458 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].upperCase
+1, specialCasings
[specialCasingCount
].upperCase
[0]);
459 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].titleCase
+1, specialCasings
[specialCasingCount
].titleCase
[0]);
462 if(++specialCasingCount
==MAX_SPECIAL_CASING_COUNT
) {
463 fprintf(stderr
, "gencase: too many special casing mappings\n");
464 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
465 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
469 static int32_t U_CALLCONV
470 compareSpecialCasings(const void *context
, const void *left
, const void *right
) {
471 return ((const SpecialCasing
*)left
)->code
-((const SpecialCasing
*)right
)->code
;
475 parseSpecialCasing(const char *filename
, UErrorCode
*pErrorCode
) {
479 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
483 u_parseDelimitedFile(filename
, ';', fields
, 5, specialCasingLineFn
, NULL
, pErrorCode
);
485 /* sort the special casing entries by code point */
486 if(specialCasingCount
>0) {
487 uprv_sortArray(specialCasings
, specialCasingCount
, sizeof(SpecialCasing
),
488 compareSpecialCasings
, NULL
, FALSE
, pErrorCode
);
490 if(U_FAILURE(*pErrorCode
)) {
494 /* replace multiple entries for any code point by one "complex" one */
496 for(i
=1; i
<specialCasingCount
; ++i
) {
497 if(specialCasings
[i
-1].code
==specialCasings
[i
].code
) {
498 /* there is a duplicate code point */
499 specialCasings
[i
-1].code
=0x7fffffff; /* remove this entry in the following sorting */
500 specialCasings
[i
].isComplex
=TRUE
; /* make the following one complex */
501 specialCasings
[i
].lowerCase
[0]=0;
502 specialCasings
[i
].upperCase
[0]=0;
503 specialCasings
[i
].titleCase
[0]=0;
508 /* if some entries just were removed, then re-sort */
510 uprv_sortArray(specialCasings
, specialCasingCount
, sizeof(SpecialCasing
),
511 compareSpecialCasings
, NULL
, FALSE
, pErrorCode
);
512 specialCasingCount
-=j
;
514 if(U_FAILURE(*pErrorCode
)) {
519 * Add one complex mapping to caseSensitive that was filtered out above:
520 * Greek final Sigma has a conditional mapping but not locale-sensitive,
521 * and it is taken when lowercasing just U+03A3 alone.
522 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
524 uset_add(caseSensitive
, 0x3c2);
527 /* parser for CaseFolding.txt ----------------------------------------------- */
529 #define MAX_CASE_FOLDING_COUNT 2000
531 static CaseFolding caseFoldings
[MAX_CASE_FOLDING_COUNT
];
532 static int32_t caseFoldingCount
=0;
534 static void U_CALLCONV
535 caseFoldingLineFn(void *context
,
536 char *fields
[][2], int32_t fieldCount
,
537 UErrorCode
*pErrorCode
) {
539 static UChar32 prevCode
=0;
544 caseFoldings
[caseFoldingCount
].code
=(UChar32
)uprv_strtoul(u_skipWhitespace(fields
[0][0]), &end
, 16);
545 end
=(char *)u_skipWhitespace(end
);
546 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
547 fprintf(stderr
, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields
[0][0]);
548 *pErrorCode
=U_PARSE_ERROR
;
552 /* get the status of this mapping */
553 caseFoldings
[caseFoldingCount
].status
=status
=*u_skipWhitespace(fields
[1][0]);
554 if(status
!='L' && status
!='E' && status
!='C' && status
!='S' && status
!='F' && status
!='I' && status
!='T') {
555 fprintf(stderr
, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields
[0][0]);
556 *pErrorCode
=U_PARSE_ERROR
;
560 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
565 /* get the mapping */
566 count
=caseFoldings
[caseFoldingCount
].full
[0]=
567 (UChar
)u_parseString(fields
[2][0], caseFoldings
[caseFoldingCount
].full
+1, 31, (uint32_t *)&caseFoldings
[caseFoldingCount
].simple
, pErrorCode
);
568 if(U_FAILURE(*pErrorCode
)) {
569 fprintf(stderr
, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields
[0][0]);
573 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
574 if(count
==0 || count
>2 || (count
==2 && UTF_IS_SINGLE(caseFoldings
[caseFoldingCount
].full
[1]))) {
575 caseFoldings
[caseFoldingCount
].simple
=0;
578 /* update the case-sensitive set */
580 uset_add(caseSensitive
, (UChar32
)caseFoldings
[caseFoldingCount
].code
);
581 _set_addAll(caseSensitive
, caseFoldings
[caseFoldingCount
].full
+1, caseFoldings
[caseFoldingCount
].full
[0]);
584 /* check the status */
586 /* check if there was a full mapping for this code point before */
587 if( caseFoldingCount
>0 &&
588 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
&&
589 caseFoldings
[caseFoldingCount
-1].status
=='F'
591 /* merge the two entries */
592 caseFoldings
[caseFoldingCount
-1].simple
=caseFoldings
[caseFoldingCount
].simple
;
595 } else if(status
=='F') {
596 /* check if there was a simple mapping for this code point before */
597 if( caseFoldingCount
>0 &&
598 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
&&
599 caseFoldings
[caseFoldingCount
-1].status
=='S'
601 /* merge the two entries */
602 uprv_memcpy(caseFoldings
[caseFoldingCount
-1].full
, caseFoldings
[caseFoldingCount
].full
, 32*U_SIZEOF_UCHAR
);
605 } else if(status
=='I' || status
=='T') {
606 /* check if there was a default mapping for this code point before (remove it) */
607 while(caseFoldingCount
>0 &&
608 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
613 /* store only a marker for special handling for cases like dotless i */
614 caseFoldings
[caseFoldingCount
].simple
=0;
615 caseFoldings
[caseFoldingCount
].full
[0]=0;
618 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
619 if(caseFoldings
[caseFoldingCount
].code
<=prevCode
&& caseFoldings
[caseFoldingCount
].code
>0) {
620 fprintf(stderr
, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
621 (unsigned long)caseFoldings
[caseFoldingCount
].code
,
622 (unsigned long)prevCode
);
623 *pErrorCode
=U_PARSE_ERROR
;
626 prevCode
=caseFoldings
[caseFoldingCount
].code
;
628 if(++caseFoldingCount
==MAX_CASE_FOLDING_COUNT
) {
629 fprintf(stderr
, "gencase: too many case folding mappings\n");
630 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
631 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
/*
 * Reads the case folding file named by filename.
 *
 * filename    path of the file to parse (main() builds it for
 *             "CaseFolding" before calling this function)
 * pErrorCode  in/out error code; checked for NULL and for an earlier
 *             failure before any parsing happens, and passed on to
 *             u_parseDelimitedFile()
 *
 * NOTE(review): the statement guarded by the error check (presumably an
 * early return) and the declaration of fields are not visible in this
 * extract - confirm against the complete file.
 */
636 parseCaseFolding(const char *filename
, UErrorCode
*pErrorCode
) {
/* do nothing useful without an error code or after an earlier failure */
639 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
/* split each line into 3 fields at ';' and hand them to caseFoldingLineFn; no context object is passed */
643 u_parseDelimitedFile(filename
, ';', fields
, 3, caseFoldingLineFn
, NULL
, pErrorCode
);
646 /* parser for UnicodeData.txt ----------------------------------------------- */
648 /* general categories */
650 genCategoryNames
[U_CHAR_CATEGORY_COUNT
]={
652 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
653 "Mc", "Nd", "Nl", "No",
655 "Cc", "Cf", "Co", "Cs",
656 "Pd", "Ps", "Pe", "Pc", "Po",
657 "Sm", "Sc", "Sk", "So",
661 static int32_t specialCasingIndex
=0, caseFoldingIndex
=0;
663 static void U_CALLCONV
664 unicodeDataLineFn(void *context
,
665 char *fields
[][2], int32_t fieldCount
,
666 UErrorCode
*pErrorCode
) {
669 static UChar32 prevCode
=0;
673 /* reset the properties */
674 uprv_memset(&p
, 0, sizeof(Props
));
676 /* get the character code, field 0 */
677 p
.code
=(UChar32
)uprv_strtoul(fields
[0][0], &end
, 16);
678 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
679 fprintf(stderr
, "gencase: syntax error in field 0 at %s\n", fields
[0][0]);
680 *pErrorCode
=U_PARSE_ERROR
;
684 /* get general category, field 2 */
685 i
=getTokenIndex(genCategoryNames
, U_CHAR_CATEGORY_COUNT
, fields
[2][0]);
689 fprintf(stderr
, "gencase: unknown general category \"%s\" at code 0x%lx\n",
690 fields
[2][0], (unsigned long)p
.code
);
691 *pErrorCode
=U_PARSE_ERROR
;
695 /* get canonical combining class, field 3 */
696 value
=(UChar32
)uprv_strtoul(fields
[3][0], &end
, 10);
697 if(end
<=fields
[3][0] || end
!=fields
[3][1] || value
>0xff) {
698 fprintf(stderr
, "gencase: syntax error in field 3 at %s\n", fields
[0][0]);
699 *pErrorCode
=U_PARSE_ERROR
;
704 /* get uppercase mapping, field 12 */
705 value
=(UChar32
)uprv_strtoul(fields
[12][0], &end
, 16);
706 if(end
!=fields
[12][1]) {
707 fprintf(stderr
, "gencase: syntax error in field 12 at code 0x%lx\n",
708 (unsigned long)p
.code
);
709 *pErrorCode
=U_PARSE_ERROR
;
712 if(value
!=0 && value
!=p
.code
) {
714 uset_add(caseSensitive
, p
.code
);
715 uset_add(caseSensitive
, value
);
718 /* get lowercase value, field 13 */
719 value
=(UChar32
)uprv_strtoul(fields
[13][0], &end
, 16);
720 if(end
!=fields
[13][1]) {
721 fprintf(stderr
, "gencase: syntax error in field 13 at code 0x%lx\n",
722 (unsigned long)p
.code
);
723 *pErrorCode
=U_PARSE_ERROR
;
726 if(value
!=0 && value
!=p
.code
) {
728 uset_add(caseSensitive
, p
.code
);
729 uset_add(caseSensitive
, value
);
732 /* get titlecase value, field 14 */
733 value
=(UChar32
)uprv_strtoul(fields
[14][0], &end
, 16);
734 if(end
!=fields
[14][1]) {
735 fprintf(stderr
, "gencase: syntax error in field 14 at code 0x%lx\n",
736 (unsigned long)p
.code
);
737 *pErrorCode
=U_PARSE_ERROR
;
740 if(value
!=0 && value
!=p
.code
) {
742 uset_add(caseSensitive
, p
.code
);
743 uset_add(caseSensitive
, value
);
746 /* set additional properties from previously parsed files */
747 if(specialCasingIndex
<specialCasingCount
&& p
.code
==specialCasings
[specialCasingIndex
].code
) {
748 p
.specialCasing
=specialCasings
+specialCasingIndex
++;
750 p
.specialCasing
=NULL
;
752 if(caseFoldingIndex
<caseFoldingCount
&& p
.code
==caseFoldings
[caseFoldingIndex
].code
) {
753 p
.caseFolding
=caseFoldings
+caseFoldingIndex
++;
755 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
756 if( p
.caseFolding
->status
=='C' &&
757 p
.caseFolding
->simple
==p
.lowerCase
765 /* check for non-character code points */
766 if((p
.code
&0xfffe)==0xfffe || (uint32_t)(p
.code
-0xfdd0)<0x20) {
767 fprintf(stderr
, "gencase: error - properties for non-character code point U+%04lx\n",
768 (unsigned long)p
.code
);
769 *pErrorCode
=U_PARSE_ERROR
;
773 /* check that the code points (p.code) are in ascending order */
774 if(p
.code
<=prevCode
&& p
.code
>0) {
775 fprintf(stderr
, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
776 (unsigned long)p
.code
, (unsigned long)prevCode
);
777 *pErrorCode
=U_PARSE_ERROR
;
781 /* properties for a single code point */
788 parseDB(const char *filename
, UErrorCode
*pErrorCode
) {
793 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
797 u_parseDelimitedFile(filename
, ';', fields
, 15, unicodeDataLineFn
, NULL
, pErrorCode
);
799 /* are all sub-properties consumed? */
800 if(specialCasingIndex
<specialCasingCount
) {
801 fprintf(stderr
, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
802 *pErrorCode
=U_PARSE_ERROR
;
805 if(caseFoldingIndex
<caseFoldingCount
) {
806 fprintf(stderr
, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
807 *pErrorCode
=U_PARSE_ERROR
;
811 if(U_FAILURE(*pErrorCode
)) {
816 0==uset_getItem(caseSensitive
, i
, &start
, &end
, NULL
, 0, pErrorCode
) && U_SUCCESS(*pErrorCode
);
819 addCaseSensitive(start
, end
);
821 if(*pErrorCode
==U_INDEX_OUTOFBOUNDS_ERROR
) {
822 *pErrorCode
=U_ZERO_ERROR
;
827 * Hey, Emacs, please set the following:
830 * indent-tabs-mode: nil