2 *******************************************************************************
4 * Copyright (C) 2004-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts the case mapping properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uset.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
41 /* data --------------------------------------------------------------------- */
45 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
48 * Unicode set collecting the case-sensitive characters;
49 * see uchar.h UCHAR_CASE_SENSITIVE.
50 * Add code points from case mappings/foldings in
51 * the root locale and with default options.
53 static USet
*caseSensitive
;
55 /* prototypes --------------------------------------------------------------- */
58 parseSpecialCasing(const char *filename
, UErrorCode
*pErrorCode
);
61 parseCaseFolding(const char *filename
, UErrorCode
*pErrorCode
);
64 parseDB(const char *filename
, UErrorCode
*pErrorCode
);
66 /* parse files with multiple binary properties ------------------------------ */
68 /* TODO: more common code, move functions to uparse.h|c */
70 /* TODO: similar to genprops/props2.c but not the same */
75 uint32_t vecValue
, vecMask
;
77 typedef struct Binary Binary
;
81 const Binary
*binaries
;
82 int32_t binariesCount
;
84 typedef struct Binaries Binaries
;
88 { "Soft_Dotted", 0, UCASE_SOFT_DOTTED
, UCASE_DOT_MASK
}
93 "PropList", propListNames
, LENGTHOF(propListNames
)
98 { "Lowercase", 0, UCASE_LOWER
, UCASE_TYPE_MASK
},
99 { "Uppercase", 0, UCASE_UPPER
, UCASE_TYPE_MASK
}
102 static const Binaries
103 derCorePropsBinaries
={
104 "DerivedCoreProperties", derCorePropsNames
, LENGTHOF(derCorePropsNames
)
107 /* treat Word_Break=MidLetter as a binary property (we ignore all other Word_Break values) */
110 { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT
), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT
) }
113 static const Binaries
115 "WordBreakProperty", wordBreakNames
, LENGTHOF(wordBreakNames
)
118 static void U_CALLCONV
119 binariesLineFn(void *context
,
120 char *fields
[][2], int32_t fieldCount
,
121 UErrorCode
*pErrorCode
) {
124 uint32_t start
, limit
;
127 bin
=(const Binaries
*)context
;
129 u_parseCodePointRange(fields
[0][0], &start
, &limit
, pErrorCode
);
130 if(U_FAILURE(*pErrorCode
)) {
131 fprintf(stderr
, "gencase: syntax error in %s.txt field 0 at %s\n", bin
->ucdFile
, fields
[0][0]);
136 /* parse binary property name */
137 s
=(char *)u_skipWhitespace(fields
[1][0]);
139 if(i
==bin
->binariesCount
) {
140 /* ignore unrecognized properties */
143 if(isToken(bin
->binaries
[i
].propName
, s
)) {
148 if(bin
->binaries
[i
].vecMask
==0) {
149 fprintf(stderr
, "gencase error: mask value %d==0 for %s %s\n",
150 (int)bin
->binaries
[i
].vecMask
, bin
->ucdFile
, bin
->binaries
[i
].propName
);
151 exit(U_INTERNAL_PROGRAM_ERROR
);
154 if(!upvec_setValue(pv
, start
, limit
, bin
->binaries
[i
].vecWord
, bin
->binaries
[i
].vecValue
, bin
->binaries
[i
].vecMask
, pErrorCode
)) {
155 fprintf(stderr
, "gencase error: unable to set %s, code: %s\n",
156 bin
->binaries
[i
].propName
, u_errorName(*pErrorCode
));
162 parseBinariesFile(char *filename
, char *basename
, const char *suffix
,
164 UErrorCode
*pErrorCode
) {
167 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
171 writeUCDFilename(basename
, bin
->ucdFile
, suffix
);
173 u_parseDelimitedFile(filename
, ';', fields
, 2, binariesLineFn
, (void *)bin
, pErrorCode
);
174 if(U_FAILURE(*pErrorCode
)) {
175 fprintf(stderr
, "error parsing %s.txt: %s\n", bin
->ucdFile
, u_errorName(*pErrorCode
));
179 /* -------------------------------------------------------------------------- */
194 /* Keep these values in sync with the above enums */
195 static UOption options
[]={
197 UOPTION_HELP_QUESTION_MARK
,
202 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG
),
204 UOPTION_DEF("csource", 'C', UOPT_NO_ARG
)
208 main(int argc
, char* argv
[]) {
210 const char *srcDir
=NULL
, *destDir
=NULL
, *suffix
=NULL
;
212 UErrorCode errorCode
=U_ZERO_ERROR
;
214 U_MAIN_INIT_ARGS(argc
, argv
);
216 /* preset then read command line options */
217 options
[DESTDIR
].value
=u_getDataDirectory();
218 options
[SOURCEDIR
].value
="";
219 options
[UNICODE_VERSION
].value
="";
220 options
[ICUDATADIR
].value
=u_getDataDirectory();
221 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
223 /* error handling, printing usage message */
226 "error in command line argument \"%s\"\n",
229 if(argc
<0 || options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
) {
231 * Broken into chunks because the C89 standard only requires compilers
232 * to support string literals of up to 509 bytes.
235 "Usage: %s [-options] [suffix]\n"
237 "read the UnicodeData.txt file and other Unicode properties files and\n"
238 "create a binary file " UCASE_DATA_NAME
"." UCASE_DATA_TYPE
" with the case mapping properties\n"
243 "\t-h or -? or --help this usage text\n"
244 "\t-v or --verbose verbose output\n"
245 "\t-c or --copyright include a copyright notice\n"
246 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
247 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
249 "\t-d or --destdir destination directory, followed by the path\n"
250 "\t-s or --sourcedir source directory, followed by the path\n"
251 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
252 "\t followed by path, defaults to %s\n"
253 "\tsuffix suffix that is to be appended with a '-'\n"
254 "\t to the source file basenames before opening;\n"
255 "\t 'gencase new' will read UnicodeData-new.txt etc.\n",
256 u_getDataDirectory());
257 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
260 /* get the options values */
261 beVerbose
=options
[VERBOSE
].doesOccur
;
262 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
263 srcDir
=options
[SOURCEDIR
].value
;
264 destDir
=options
[DESTDIR
].value
;
272 if(options
[UNICODE_VERSION
].doesOccur
) {
273 setUnicodeVersion(options
[UNICODE_VERSION
].value
);
275 /* else use the default dataVersion in store.c */
277 if (options
[ICUDATADIR
].doesOccur
) {
278 u_setDataDirectory(options
[ICUDATADIR
].value
);
281 /* prepare the filename beginning with the source dir */
282 uprv_strcpy(filename
, srcDir
);
283 basename
=filename
+uprv_strlen(filename
);
284 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
285 *basename
++=U_FILE_SEP_CHAR
;
289 pv
=upvec_open(2, 10000);
290 caseSensitive
=uset_open(1, 0); /* empty set (start>end) */
292 /* process SpecialCasing.txt */
293 writeUCDFilename(basename
, "SpecialCasing", suffix
);
294 parseSpecialCasing(filename
, &errorCode
);
296 /* process CaseFolding.txt */
297 writeUCDFilename(basename
, "CaseFolding", suffix
);
298 parseCaseFolding(filename
, &errorCode
);
300 /* process additional properties files */
303 parseBinariesFile(filename
, basename
, suffix
, &propListBinaries
, &errorCode
);
305 parseBinariesFile(filename
, basename
, suffix
, &derCorePropsBinaries
, &errorCode
);
307 if(ucdVersion
>=UNI_4_1
) {
308 parseBinariesFile(filename
, basename
, suffix
, &wordBreakBinaries
, &errorCode
);
311 /* process UnicodeData.txt */
312 writeUCDFilename(basename
, "UnicodeData", suffix
);
313 parseDB(filename
, &errorCode
);
315 /* process parsed data */
320 if(U_SUCCESS(errorCode
)) {
321 /* write the properties data file */
322 generateData(destDir
, options
[CSOURCE
].doesOccur
);
330 writeUCDFilename(char *basename
, const char *filename
, const char *suffix
) {
331 int32_t length
=(int32_t)uprv_strlen(filename
);
332 uprv_strcpy(basename
, filename
);
334 basename
[length
++]='-';
335 uprv_strcpy(basename
+length
, suffix
);
336 length
+=(int32_t)uprv_strlen(suffix
);
338 uprv_strcpy(basename
+length
, ".txt");
341 /* TODO: move to toolutil */
343 isToken(const char *token
, const char *s
) {
347 s
=u_skipWhitespace(s
);
354 z
=u_skipWhitespace(s
+j
);
355 if(*z
==';' || *z
==0) {
367 getTokenIndex(const char *const tokens
[], int32_t countTokens
, const char *s
) {
371 s
=u_skipWhitespace(s
);
372 for(i
=0; i
<countTokens
; ++i
) {
381 z
=u_skipWhitespace(s
+j
);
382 if(*z
==';' || *z
==0 || *z
=='#' || *z
=='\r' || *z
=='\n') {
395 _set_addAll(USet
*set
, const UChar
*s
, int32_t length
) {
399 /* needs length>=0 */
400 for(i
=0; i
<length
; /* U16_NEXT advances i */) {
401 U16_NEXT(s
, i
, length
, c
);
406 /* parser for SpecialCasing.txt --------------------------------------------- */
408 #define MAX_SPECIAL_CASING_COUNT 500
410 static SpecialCasing specialCasings
[MAX_SPECIAL_CASING_COUNT
];
411 static int32_t specialCasingCount
=0;
413 static void U_CALLCONV
414 specialCasingLineFn(void *context
,
415 char *fields
[][2], int32_t fieldCount
,
416 UErrorCode
*pErrorCode
) {
420 specialCasings
[specialCasingCount
].code
=(UChar32
)uprv_strtoul(u_skipWhitespace(fields
[0][0]), &end
, 16);
421 end
=(char *)u_skipWhitespace(end
);
422 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
423 fprintf(stderr
, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields
[0][0]);
424 *pErrorCode
=U_PARSE_ERROR
;
428 /* is this a complex mapping? */
429 if(*(end
=(char *)u_skipWhitespace(fields
[4][0]))!=0 && *end
!=';' && *end
!='#') {
430 /* there is some condition text in the fifth field */
431 specialCasings
[specialCasingCount
].isComplex
=TRUE
;
433 /* do not store any actual mappings for this */
434 specialCasings
[specialCasingCount
].lowerCase
[0]=0;
435 specialCasings
[specialCasingCount
].upperCase
[0]=0;
436 specialCasings
[specialCasingCount
].titleCase
[0]=0;
438 /* just set the "complex" flag and get the case mappings */
439 specialCasings
[specialCasingCount
].isComplex
=FALSE
;
440 specialCasings
[specialCasingCount
].lowerCase
[0]=
441 (UChar
)u_parseString(fields
[1][0], specialCasings
[specialCasingCount
].lowerCase
+1, 31, NULL
, pErrorCode
);
442 specialCasings
[specialCasingCount
].upperCase
[0]=
443 (UChar
)u_parseString(fields
[3][0], specialCasings
[specialCasingCount
].upperCase
+1, 31, NULL
, pErrorCode
);
444 specialCasings
[specialCasingCount
].titleCase
[0]=
445 (UChar
)u_parseString(fields
[2][0], specialCasings
[specialCasingCount
].titleCase
+1, 31, NULL
, pErrorCode
);
446 if(U_FAILURE(*pErrorCode
)) {
447 fprintf(stderr
, "gencase: error parsing special casing at %s\n", fields
[0][0]);
451 uset_add(caseSensitive
, (UChar32
)specialCasings
[specialCasingCount
].code
);
452 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].lowerCase
+1, specialCasings
[specialCasingCount
].lowerCase
[0]);
453 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].upperCase
+1, specialCasings
[specialCasingCount
].upperCase
[0]);
454 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].titleCase
+1, specialCasings
[specialCasingCount
].titleCase
[0]);
457 if(++specialCasingCount
==MAX_SPECIAL_CASING_COUNT
) {
458 fprintf(stderr
, "gencase: too many special casing mappings\n");
459 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
460 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
464 static int32_t U_CALLCONV
465 compareSpecialCasings(const void *context
, const void *left
, const void *right
) {
466 return ((const SpecialCasing
*)left
)->code
-((const SpecialCasing
*)right
)->code
;
470 parseSpecialCasing(const char *filename
, UErrorCode
*pErrorCode
) {
474 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
478 u_parseDelimitedFile(filename
, ';', fields
, 5, specialCasingLineFn
, NULL
, pErrorCode
);
480 /* sort the special casing entries by code point */
481 if(specialCasingCount
>0) {
482 uprv_sortArray(specialCasings
, specialCasingCount
, sizeof(SpecialCasing
),
483 compareSpecialCasings
, NULL
, FALSE
, pErrorCode
);
485 if(U_FAILURE(*pErrorCode
)) {
489 /* replace multiple entries for any code point by one "complex" one */
491 for(i
=1; i
<specialCasingCount
; ++i
) {
492 if(specialCasings
[i
-1].code
==specialCasings
[i
].code
) {
493 /* there is a duplicate code point */
494 specialCasings
[i
-1].code
=0x7fffffff; /* remove this entry in the following sorting */
495 specialCasings
[i
].isComplex
=TRUE
; /* make the following one complex */
496 specialCasings
[i
].lowerCase
[0]=0;
497 specialCasings
[i
].upperCase
[0]=0;
498 specialCasings
[i
].titleCase
[0]=0;
503 /* if some entries just were removed, then re-sort */
505 uprv_sortArray(specialCasings
, specialCasingCount
, sizeof(SpecialCasing
),
506 compareSpecialCasings
, NULL
, FALSE
, pErrorCode
);
507 specialCasingCount
-=j
;
509 if(U_FAILURE(*pErrorCode
)) {
514 * Add one complex mapping to caseSensitive that was filtered out above:
515 * Greek final Sigma has a conditional mapping but not locale-sensitive,
516 * and it is taken when lowercasing just U+03A3 alone.
517 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
519 uset_add(caseSensitive
, 0x3c2);
522 /* parser for CaseFolding.txt ----------------------------------------------- */
524 #define MAX_CASE_FOLDING_COUNT 2000
526 static CaseFolding caseFoldings
[MAX_CASE_FOLDING_COUNT
];
527 static int32_t caseFoldingCount
=0;
529 static void U_CALLCONV
530 caseFoldingLineFn(void *context
,
531 char *fields
[][2], int32_t fieldCount
,
532 UErrorCode
*pErrorCode
) {
534 static UChar32 prevCode
=0;
539 caseFoldings
[caseFoldingCount
].code
=(UChar32
)uprv_strtoul(u_skipWhitespace(fields
[0][0]), &end
, 16);
540 end
=(char *)u_skipWhitespace(end
);
541 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
542 fprintf(stderr
, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields
[0][0]);
543 *pErrorCode
=U_PARSE_ERROR
;
547 /* get the status of this mapping */
548 caseFoldings
[caseFoldingCount
].status
=status
=*u_skipWhitespace(fields
[1][0]);
549 if(status
!='L' && status
!='E' && status
!='C' && status
!='S' && status
!='F' && status
!='I' && status
!='T') {
550 fprintf(stderr
, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields
[0][0]);
551 *pErrorCode
=U_PARSE_ERROR
;
555 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
560 /* get the mapping */
561 count
=caseFoldings
[caseFoldingCount
].full
[0]=
562 (UChar
)u_parseString(fields
[2][0], caseFoldings
[caseFoldingCount
].full
+1, 31, (uint32_t *)&caseFoldings
[caseFoldingCount
].simple
, pErrorCode
);
563 if(U_FAILURE(*pErrorCode
)) {
564 fprintf(stderr
, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields
[0][0]);
568 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
569 if(count
==0 || count
>2 || (count
==2 && UTF_IS_SINGLE(caseFoldings
[caseFoldingCount
].full
[1]))) {
570 caseFoldings
[caseFoldingCount
].simple
=0;
573 /* update the case-sensitive set */
575 uset_add(caseSensitive
, (UChar32
)caseFoldings
[caseFoldingCount
].code
);
576 _set_addAll(caseSensitive
, caseFoldings
[caseFoldingCount
].full
+1, caseFoldings
[caseFoldingCount
].full
[0]);
579 /* check the status */
581 /* check if there was a full mapping for this code point before */
582 if( caseFoldingCount
>0 &&
583 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
&&
584 caseFoldings
[caseFoldingCount
-1].status
=='F'
586 /* merge the two entries */
587 caseFoldings
[caseFoldingCount
-1].simple
=caseFoldings
[caseFoldingCount
].simple
;
590 } else if(status
=='F') {
591 /* check if there was a simple mapping for this code point before */
592 if( caseFoldingCount
>0 &&
593 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
&&
594 caseFoldings
[caseFoldingCount
-1].status
=='S'
596 /* merge the two entries */
597 uprv_memcpy(caseFoldings
[caseFoldingCount
-1].full
, caseFoldings
[caseFoldingCount
].full
, 32*U_SIZEOF_UCHAR
);
600 } else if(status
=='I' || status
=='T') {
601 /* check if there was a default mapping for this code point before (remove it) */
602 while(caseFoldingCount
>0 &&
603 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
608 /* store only a marker for special handling for cases like dotless i */
609 caseFoldings
[caseFoldingCount
].simple
=0;
610 caseFoldings
[caseFoldingCount
].full
[0]=0;
613 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
614 if(caseFoldings
[caseFoldingCount
].code
<=prevCode
&& caseFoldings
[caseFoldingCount
].code
>0) {
615 fprintf(stderr
, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
616 (unsigned long)caseFoldings
[caseFoldingCount
].code
,
617 (unsigned long)prevCode
);
618 *pErrorCode
=U_PARSE_ERROR
;
621 prevCode
=caseFoldings
[caseFoldingCount
].code
;
623 if(++caseFoldingCount
==MAX_CASE_FOLDING_COUNT
) {
624 fprintf(stderr
, "gencase: too many case folding mappings\n");
625 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
626 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
631 parseCaseFolding(const char *filename
, UErrorCode
*pErrorCode
) {
634 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
638 u_parseDelimitedFile(filename
, ';', fields
, 3, caseFoldingLineFn
, NULL
, pErrorCode
);
641 /* parser for UnicodeData.txt ----------------------------------------------- */
643 /* general categories */
645 genCategoryNames
[U_CHAR_CATEGORY_COUNT
]={
647 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
648 "Mc", "Nd", "Nl", "No",
650 "Cc", "Cf", "Co", "Cs",
651 "Pd", "Ps", "Pe", "Pc", "Po",
652 "Sm", "Sc", "Sk", "So",
656 static int32_t specialCasingIndex
=0, caseFoldingIndex
=0;
658 static void U_CALLCONV
659 unicodeDataLineFn(void *context
,
660 char *fields
[][2], int32_t fieldCount
,
661 UErrorCode
*pErrorCode
) {
664 static UChar32 prevCode
=0;
668 /* reset the properties */
669 uprv_memset(&p
, 0, sizeof(Props
));
671 /* get the character code, field 0 */
672 p
.code
=(UChar32
)uprv_strtoul(fields
[0][0], &end
, 16);
673 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
674 fprintf(stderr
, "gencase: syntax error in field 0 at %s\n", fields
[0][0]);
675 *pErrorCode
=U_PARSE_ERROR
;
679 /* get general category, field 2 */
680 i
=getTokenIndex(genCategoryNames
, U_CHAR_CATEGORY_COUNT
, fields
[2][0]);
684 fprintf(stderr
, "gencase: unknown general category \"%s\" at code 0x%lx\n",
685 fields
[2][0], (unsigned long)p
.code
);
686 *pErrorCode
=U_PARSE_ERROR
;
690 /* get canonical combining class, field 3 */
691 value
=(UChar32
)uprv_strtoul(fields
[3][0], &end
, 10);
692 if(end
<=fields
[3][0] || end
!=fields
[3][1] || value
>0xff) {
693 fprintf(stderr
, "gencase: syntax error in field 3 at %s\n", fields
[0][0]);
694 *pErrorCode
=U_PARSE_ERROR
;
699 /* get uppercase mapping, field 12 */
700 value
=(UChar32
)uprv_strtoul(fields
[12][0], &end
, 16);
701 if(end
!=fields
[12][1]) {
702 fprintf(stderr
, "gencase: syntax error in field 12 at code 0x%lx\n",
703 (unsigned long)p
.code
);
704 *pErrorCode
=U_PARSE_ERROR
;
707 if(value
!=0 && value
!=p
.code
) {
709 uset_add(caseSensitive
, p
.code
);
710 uset_add(caseSensitive
, value
);
713 /* get lowercase value, field 13 */
714 value
=(UChar32
)uprv_strtoul(fields
[13][0], &end
, 16);
715 if(end
!=fields
[13][1]) {
716 fprintf(stderr
, "gencase: syntax error in field 13 at code 0x%lx\n",
717 (unsigned long)p
.code
);
718 *pErrorCode
=U_PARSE_ERROR
;
721 if(value
!=0 && value
!=p
.code
) {
723 uset_add(caseSensitive
, p
.code
);
724 uset_add(caseSensitive
, value
);
727 /* get titlecase value, field 14 */
728 value
=(UChar32
)uprv_strtoul(fields
[14][0], &end
, 16);
729 if(end
!=fields
[14][1]) {
730 fprintf(stderr
, "gencase: syntax error in field 14 at code 0x%lx\n",
731 (unsigned long)p
.code
);
732 *pErrorCode
=U_PARSE_ERROR
;
735 if(value
!=0 && value
!=p
.code
) {
737 uset_add(caseSensitive
, p
.code
);
738 uset_add(caseSensitive
, value
);
741 /* set additional properties from previously parsed files */
742 if(specialCasingIndex
<specialCasingCount
&& p
.code
==specialCasings
[specialCasingIndex
].code
) {
743 p
.specialCasing
=specialCasings
+specialCasingIndex
++;
745 p
.specialCasing
=NULL
;
747 if(caseFoldingIndex
<caseFoldingCount
&& p
.code
==caseFoldings
[caseFoldingIndex
].code
) {
748 p
.caseFolding
=caseFoldings
+caseFoldingIndex
++;
750 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
751 if( p
.caseFolding
->status
=='C' &&
752 p
.caseFolding
->simple
==p
.lowerCase
760 /* check for non-character code points */
761 if((p
.code
&0xfffe)==0xfffe || (uint32_t)(p
.code
-0xfdd0)<0x20) {
762 fprintf(stderr
, "gencase: error - properties for non-character code point U+%04lx\n",
763 (unsigned long)p
.code
);
764 *pErrorCode
=U_PARSE_ERROR
;
768 /* check that the code points (p.code) are in ascending order */
769 if(p
.code
<=prevCode
&& p
.code
>0) {
770 fprintf(stderr
, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
771 (unsigned long)p
.code
, (unsigned long)prevCode
);
772 *pErrorCode
=U_PARSE_ERROR
;
776 /* properties for a single code point */
783 parseDB(const char *filename
, UErrorCode
*pErrorCode
) {
788 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
792 u_parseDelimitedFile(filename
, ';', fields
, 15, unicodeDataLineFn
, NULL
, pErrorCode
);
794 /* are all sub-properties consumed? */
795 if(specialCasingIndex
<specialCasingCount
) {
796 fprintf(stderr
, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
797 *pErrorCode
=U_PARSE_ERROR
;
800 if(caseFoldingIndex
<caseFoldingCount
) {
801 fprintf(stderr
, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
802 *pErrorCode
=U_PARSE_ERROR
;
806 if(U_FAILURE(*pErrorCode
)) {
811 0==uset_getItem(caseSensitive
, i
, &start
, &end
, NULL
, 0, pErrorCode
) && U_SUCCESS(*pErrorCode
);
814 addCaseSensitive(start
, end
);
816 if(*pErrorCode
==U_INDEX_OUTOFBOUNDS_ERROR
) {
817 *pErrorCode
=U_ZERO_ERROR
;
822 * Hey, Emacs, please set the following:
825 * indent-tabs-mode: nil