2 *******************************************************************************
4 * Copyright (C) 2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts the case mapping properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random access to
20 * the properties of each Unicode character.
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uset.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
41 /* data --------------------------------------------------------------------- */
45 UBool beVerbose
=FALSE
, haveCopyright
=TRUE
;
48 * Unicode set collecting the case-sensitive characters;
49 * see uchar.h UCHAR_CASE_SENSITIVE.
50 * Add code points from case mappings/foldings in
51 * the root locale and with default options.
53 static USet
*caseSensitive
;
55 /* prototypes --------------------------------------------------------------- */
58 parseSpecialCasing(const char *filename
, UErrorCode
*pErrorCode
);
61 parseCaseFolding(const char *filename
, UErrorCode
*pErrorCode
);
64 parseDB(const char *filename
, UErrorCode
*pErrorCode
);
66 /* parse files with multiple binary properties ------------------------------ */
68 /* TODO: more common code, move functions to uparse.h|c */
70 /* TODO: similar to genprops/props2.c but not the same */
75 uint32_t vecValue
, vecMask
;
77 typedef struct Binary Binary
;
81 const Binary
*binaries
;
82 int32_t binariesCount
;
84 typedef struct Binaries Binaries
;
88 { "Soft_Dotted", 0, UCASE_SOFT_DOTTED
, UCASE_DOT_MASK
}
93 "PropList", propListNames
, LENGTHOF(propListNames
)
98 { "Lowercase", 0, UCASE_LOWER
, UCASE_TYPE_MASK
},
99 { "Uppercase", 0, UCASE_UPPER
, UCASE_TYPE_MASK
}
102 static const Binaries
103 derCorePropsBinaries
={
104 "DerivedCoreProperties", derCorePropsNames
, LENGTHOF(derCorePropsNames
)
/*
 * Line callback that parseBinariesFile() hands to u_parseDelimitedFile().
 * context is the Binaries descriptor of the file being parsed; it is cast
 * back to const Binaries * below.
 * Field 0 is parsed as a code point range, field 1 as a binary property name.
 * The name is matched against bin->binaries[] with isToken(), and the
 * matching entry's vecWord/vecValue/vecMask are written into the properties
 * vector pv with upvec_setValue().
 * NOTE(review): local declarations, the search loop header and the
 * return/exit statements are not visible in this excerpt.
 */
107 static void U_CALLCONV
108 binariesLineFn(void *context
,
109 char *fields
[][2], int32_t fieldCount
,
110 UErrorCode
*pErrorCode
) {
113 uint32_t start
, limit
;
116 bin
=(const Binaries
*)context
;
/* field 0: code point or code point range */
118 u_parseCodePointRange(fields
[0][0], &start
, &limit
, pErrorCode
);
119 if(U_FAILURE(*pErrorCode
)) {
120 fprintf(stderr
, "gencase: syntax error in %s.txt field 0 at %s\n", bin
->ucdFile
, fields
[0][0]);
125 /* parse binary property name */
126 s
=(char *)u_skipWhitespace(fields
[1][0]);
/* i reaching binariesCount means no table entry matched the name */
128 if(i
==bin
->binariesCount
) {
129 /* ignore unrecognized properties */
132 if(isToken(bin
->binaries
[i
].propName
, s
)) {
/* a zero mask is reported as an internal table error */
137 if(bin
->binaries
[i
].vecMask
==0) {
138 fprintf(stderr
, "gencase error: mask value %d==0 for %s %s\n",
139 (int)bin
->binaries
[i
].vecMask
, bin
->ucdFile
, bin
->binaries
[i
].propName
);
140 exit(U_INTERNAL_PROGRAM_ERROR
);
/* store the property bits for the whole range; a false result is reported */
143 if(!upvec_setValue(pv
, start
, limit
, bin
->binaries
[i
].vecWord
, bin
->binaries
[i
].vecValue
, bin
->binaries
[i
].vecMask
, pErrorCode
)) {
144 fprintf(stderr
, "gencase error: unable to set %s, code: %s\n",
145 bin
->binaries
[i
].propName
, u_errorName(*pErrorCode
));
/*
 * Parses one UCD file of binary properties described by bin.
 * The file name is written at basename by
 * writeUCDFilename(basename, bin->ucdFile, suffix); filename is then parsed
 * as a ';'-delimited file with 2 fields per line, calling binariesLineFn()
 * with bin as its context.
 * The function first tests for a NULL pErrorCode or an already-failed
 * *pErrorCode (the body of that test is not visible in this excerpt).
 * A parse failure is reported on stderr.
 * NOTE(review): the parameter line declaring bin is not visible here.
 */
151 parseBinariesFile(char *filename
, char *basename
, const char *suffix
,
153 UErrorCode
*pErrorCode
) {
156 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
160 writeUCDFilename(basename
, bin
->ucdFile
, suffix
);
162 u_parseDelimitedFile(filename
, ';', fields
, 2, binariesLineFn
, (void *)bin
, pErrorCode
);
163 if(U_FAILURE(*pErrorCode
)) {
164 fprintf(stderr
, "error parsing %s.txt: %s\n", bin
->ucdFile
, u_errorName(*pErrorCode
));
168 /* -------------------------------------------------------------------------- */
182 /* Keep these values in sync with the above enums */
183 static UOption options
[]={
185 UOPTION_HELP_QUESTION_MARK
,
190 { "unicode", NULL
, NULL
, NULL
, 'u', UOPT_REQUIRES_ARG
, 0 },
/*
 * Tool entry point.
 * Presets option defaults, parses the command line with u_parseArgs(), and
 * prints the usage text when parsing failed or help was requested.
 * It then parses, in this order: SpecialCasing, CaseFolding, the two
 * binary-property files (propListBinaries, derCorePropsBinaries) and
 * UnicodeData. All parsers share errorCode; generateData(destDir) is called
 * only if it still indicates success.
 * NOTE(review): several lines (local declarations, fprintf calls around the
 * usage text, the suffix assignment, closing braces) are not visible in this
 * excerpt.
 */
195 main(int argc
, char* argv
[]) {
197 const char *srcDir
=NULL
, *destDir
=NULL
, *suffix
=NULL
;
199 UErrorCode errorCode
=U_ZERO_ERROR
;
201 U_MAIN_INIT_ARGS(argc
, argv
);
203 /* preset then read command line options */
204 options
[DESTDIR
].value
=u_getDataDirectory();
205 options
[SOURCEDIR
].value
="";
206 options
[UNICODE_VERSION
].value
="";
207 options
[ICUDATADIR
].value
=u_getDataDirectory();
208 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
210 /* error handling, printing usage message */
213 "error in command line argument \"%s\"\n",
/* a negative argc from u_parseArgs() is treated as a command line error */
216 if(argc
<0 || options
[HELP_H
].doesOccur
|| options
[HELP_QUESTION_MARK
].doesOccur
) {
218 * Broken into chucks because the C89 standard says the minimum
219 * required supported string length is 509 bytes.
222 "Usage: %s [-options] [suffix]\n"
224 "read the UnicodeData.txt file and other Unicode properties files and\n"
225 "create a binary file " UCASE_DATA_NAME
"." UCASE_DATA_TYPE
" with the case mapping properties\n"
230 "\t-h or -? or --help this usage text\n"
231 "\t-v or --verbose verbose output\n"
232 "\t-c or --copyright include a copyright notice\n"
233 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
235 "\t-d or --destdir destination directory, followed by the path\n"
236 "\t-s or --sourcedir source directory, followed by the path\n"
237 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
238 "\t followed by path, defaults to %s\n"
239 "\tsuffix suffix that is to be appended with a '-'\n"
240 "\t to the source file basenames before opening;\n"
241 "\t 'gencase new' will read UnicodeData-new.txt etc.\n",
242 u_getDataDirectory());
/* a bad command line yields an error exit code; a help request exits with U_ZERO_ERROR */
243 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
246 /* get the options values */
247 beVerbose
=options
[VERBOSE
].doesOccur
;
248 haveCopyright
=options
[COPYRIGHT
].doesOccur
;
249 srcDir
=options
[SOURCEDIR
].value
;
250 destDir
=options
[DESTDIR
].value
;
258 if(options
[UNICODE_VERSION
].doesOccur
) {
259 setUnicodeVersion(options
[UNICODE_VERSION
].value
);
261 /* else use the default dataVersion in store.c */
263 if (options
[ICUDATADIR
].doesOccur
) {
264 u_setDataDirectory(options
[ICUDATADIR
].value
);
267 /* prepare the filename beginning with the source dir */
268 uprv_strcpy(filename
, srcDir
);
/* basename points just past the directory part; a separator is appended if one is missing */
269 basename
=filename
+uprv_strlen(filename
);
270 if(basename
>filename
&& *(basename
-1)!=U_FILE_SEP_CHAR
) {
271 *basename
++=U_FILE_SEP_CHAR
;
/* open the properties vector pv and the caseSensitive set that the parsers fill */
275 pv
=upvec_open(1, 10000);
276 caseSensitive
=uset_open(1, 0); /* empty set (start>end) */
278 /* process SpecialCasing.txt */
279 writeUCDFilename(basename
, "SpecialCasing", suffix
);
280 parseSpecialCasing(filename
, &errorCode
);
282 /* process CaseFolding.txt */
283 writeUCDFilename(basename
, "CaseFolding", suffix
);
284 parseCaseFolding(filename
, &errorCode
);
286 /* process additional properties files */
289 parseBinariesFile(filename
, basename
, suffix
, &propListBinaries
, &errorCode
);
291 parseBinariesFile(filename
, basename
, suffix
, &derCorePropsBinaries
, &errorCode
);
293 /* process UnicodeData.txt */
294 writeUCDFilename(basename
, "UnicodeData", suffix
);
295 parseDB(filename
, &errorCode
);
297 /* process parsed data */
302 if(U_SUCCESS(errorCode
)) {
303 /* write the properties data file */
304 generateData(destDir
);
/*
 * Writes the name of a UCD source file at basename:
 *   "<filename>.txt"            if suffix is NULL
 *   "<filename>-<suffix>.txt"   otherwise
 *
 * basename  destination buffer; the caller must provide room for the
 *           complete name including the terminating NUL
 * filename  base name of the UCD file without extension, e.g. "UnicodeData"
 * suffix    optional suffix from the command line; may be NULL
 *
 * main() initializes its suffix variable to NULL, so the suffix must not be
 * copied or measured unless it is actually present.
 */
void
writeUCDFilename(char *basename, const char *filename, const char *suffix) {
    int32_t length=(int32_t)uprv_strlen(filename);

    uprv_strcpy(basename, filename);

    if(suffix!=NULL) {
        /* insert the '-' separator, then the suffix itself */
        basename[length++]='-';
        uprv_strcpy(basename+length, suffix);
        length+=(int32_t)uprv_strlen(suffix);
    }

    uprv_strcpy(basename+length, ".txt");
}
323 /* TODO: move to toolutil */
/*
 * Tests whether the text at s is the given token.
 * Leading whitespace in s is skipped first. After the compared portion
 * (offset j), trailing whitespace is skipped and the match is accepted only
 * if the next character is ';' or the terminating NUL.
 * NOTE(review): the character comparison that establishes j and the return
 * statements are not visible in this excerpt.
 */
325 isToken(const char *token
, const char *s
) {
329 s
=u_skipWhitespace(s
);
336 z
=u_skipWhitespace(s
+j
);
337 if(*z
==';' || *z
==0) {
/*
 * Looks up the text at s in the tokens[] array of countTokens entries.
 * Leading whitespace in s is skipped once, then each token is tried in order.
 * After the compared portion (offset j), trailing whitespace is skipped and
 * the candidate is accepted only if the next character is one of
 * ';', NUL, '#', CR or LF.
 * unicodeDataLineFn() uses the result as the general category index.
 * NOTE(review): the character comparison that establishes j and the return
 * statements (including the not-found result) are not visible in this excerpt.
 */
349 getTokenIndex(const char *const tokens
[], int32_t countTokens
, const char *s
) {
353 s
=u_skipWhitespace(s
);
354 for(i
=0; i
<countTokens
; ++i
) {
363 z
=u_skipWhitespace(s
+j
);
364 if(*z
==';' || *z
==0 || *z
=='#' || *z
=='\r' || *z
=='\n') {
/*
 * Walks the UTF-16 string s of the given length, one code point at a time.
 * U16_NEXT reads the next code point into c and advances i, which is why
 * the for statement has no increment expression.
 * NOTE(review): the statement that uses c is not visible in this excerpt;
 * presumably each code point is added to set - confirm.
 */
377 _set_addAll(USet
*set
, const UChar
*s
, int32_t length
) {
381 /* needs length>=0 */
382 for(i
=0; i
<length
; /* U16_NEXT advances i */) {
383 U16_NEXT(s
, i
, length
, c
);
388 /* parser for SpecialCasing.txt --------------------------------------------- */
/*
 * Fixed-size storage for the entries read from SpecialCasing.txt.
 * specialCasingLineFn() fills specialCasings[specialCasingCount] and exits
 * the program once the count reaches MAX_SPECIAL_CASING_COUNT.
 */
390 #define MAX_SPECIAL_CASING_COUNT 500
392 static SpecialCasing specialCasings
[MAX_SPECIAL_CASING_COUNT
];
393 static int32_t specialCasingCount
=0;
/*
 * Line callback for SpecialCasing.txt, passed to u_parseDelimitedFile() by
 * parseSpecialCasing(). It fills the next free slot
 * specialCasings[specialCasingCount]:
 *  - field 0: the code point (hexadecimal)
 *  - field 4: if it contains condition text, the entry is marked complex
 *    and its three mapping strings are left empty
 *  - otherwise fields 1, 3 and 2 are parsed into lowerCase, upperCase and
 *    titleCase; element [0] of each array holds the length and the string
 *    starts at [1]
 * The code point and the parsed strings are passed on to caseSensitive.
 * NOTE(review): local declarations, the else line between the two branches
 * and the return/exit statements after errors are not visible in this
 * excerpt.
 */
395 static void U_CALLCONV
396 specialCasingLineFn(void *context
,
397 char *fields
[][2], int32_t fieldCount
,
398 UErrorCode
*pErrorCode
) {
402 specialCasings
[specialCasingCount
].code
=(UChar32
)uprv_strtoul(u_skipWhitespace(fields
[0][0]), &end
, 16);
403 end
=(char *)u_skipWhitespace(end
);
/* the number must be non-empty and must extend to the end of field 0 */
404 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
405 fprintf(stderr
, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields
[0][0]);
406 *pErrorCode
=U_PARSE_ERROR
;
410 /* is this a complex mapping? */
411 if(*(end
=(char *)u_skipWhitespace(fields
[4][0]))!=0 && *end
!=';' && *end
!='#') {
412 /* there is some condition text in the fifth field */
413 specialCasings
[specialCasingCount
].isComplex
=TRUE
;
415 /* do not store any actual mappings for this */
416 specialCasings
[specialCasingCount
].lowerCase
[0]=0;
417 specialCasings
[specialCasingCount
].upperCase
[0]=0;
418 specialCasings
[specialCasingCount
].titleCase
[0]=0;
420 /* just set the "complex" flag and get the case mappings */
421 specialCasings
[specialCasingCount
].isComplex
=FALSE
;
/* note the field order: 1=lower, 3=upper, 2=title */
422 specialCasings
[specialCasingCount
].lowerCase
[0]=
423 (UChar
)u_parseString(fields
[1][0], specialCasings
[specialCasingCount
].lowerCase
+1, 31, NULL
, pErrorCode
);
424 specialCasings
[specialCasingCount
].upperCase
[0]=
425 (UChar
)u_parseString(fields
[3][0], specialCasings
[specialCasingCount
].upperCase
+1, 31, NULL
, pErrorCode
);
426 specialCasings
[specialCasingCount
].titleCase
[0]=
427 (UChar
)u_parseString(fields
[2][0], specialCasings
[specialCasingCount
].titleCase
+1, 31, NULL
, pErrorCode
);
428 if(U_FAILURE(*pErrorCode
)) {
429 fprintf(stderr
, "gencase: error parsing special casing at %s\n", fields
[0][0]);
/* record the source code point and the three mapping strings in the caseSensitive set */
433 uset_add(caseSensitive
, (UChar32
)specialCasings
[specialCasingCount
].code
);
434 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].lowerCase
+1, specialCasings
[specialCasingCount
].lowerCase
[0]);
435 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].upperCase
+1, specialCasings
[specialCasingCount
].upperCase
[0]);
436 _set_addAll(caseSensitive
, specialCasings
[specialCasingCount
].titleCase
+1, specialCasings
[specialCasingCount
].titleCase
[0]);
/* commit the slot; stop before the next call could write past the end of the array */
439 if(++specialCasingCount
==MAX_SPECIAL_CASING_COUNT
) {
440 fprintf(stderr
, "gencase: too many special casing mappings\n");
441 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
442 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
446 static int32_t U_CALLCONV
447 compareSpecialCasings(const void *context
, const void *left
, const void *right
) {
448 return ((const SpecialCasing
*)left
)->code
-((const SpecialCasing
*)right
)->code
;
/*
 * Reads SpecialCasing.txt into specialCasings[] and normalizes the result.
 *  1. Parses filename as a ';'-delimited file with 5 fields per line,
 *     calling specialCasingLineFn() for each line.
 *  2. Sorts the entries by code point.
 *  3. For each pair of neighbours with the same code point, the earlier
 *     entry gets the code 0x7fffffff so that it sorts to the end, and the
 *     later entry becomes a "complex" entry with empty mappings.
 *  4. Sorts again and reduces specialCasingCount by j.
 *  5. Adds U+03C2 to caseSensitive (see the Final_Sigma note below).
 * The function first tests for a NULL pErrorCode or an already-failed
 * *pErrorCode (the body of that test is not visible in this excerpt).
 * NOTE(review): the lines that initialize and increment j and that guard the
 * second sort are not visible here; presumably j counts the entries marked
 * for removal - confirm.
 */
452 parseSpecialCasing(const char *filename
, UErrorCode
*pErrorCode
) {
456 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
460 u_parseDelimitedFile(filename
, ';', fields
, 5, specialCasingLineFn
, NULL
, pErrorCode
);
462 /* sort the special casing entries by code point */
463 if(specialCasingCount
>0) {
464 uprv_sortArray(specialCasings
, specialCasingCount
, sizeof(SpecialCasing
),
465 compareSpecialCasings
, NULL
, FALSE
, pErrorCode
);
467 if(U_FAILURE(*pErrorCode
)) {
471 /* replace multiple entries for any code point by one "complex" one */
/* the array is sorted, so entries for the same code point are adjacent */
473 for(i
=1; i
<specialCasingCount
; ++i
) {
474 if(specialCasings
[i
-1].code
==specialCasings
[i
].code
) {
475 /* there is a duplicate code point */
476 specialCasings
[i
-1].code
=0x7fffffff; /* remove this entry in the following sorting */
477 specialCasings
[i
].isComplex
=TRUE
; /* make the following one complex */
478 specialCasings
[i
].lowerCase
[0]=0;
479 specialCasings
[i
].upperCase
[0]=0;
480 specialCasings
[i
].titleCase
[0]=0;
485 /* if some entries just were removed, then re-sort */
487 uprv_sortArray(specialCasings
, specialCasingCount
, sizeof(SpecialCasing
),
488 compareSpecialCasings
, NULL
, FALSE
, pErrorCode
);
489 specialCasingCount
-=j
;
491 if(U_FAILURE(*pErrorCode
)) {
496 * Add one complex mapping to caseSensitive that was filtered out above:
497 * Greek final Sigma has a conditional mapping but not locale-sensitive,
498 * and it is taken when lowercasing just U+03A3 alone.
499 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
501 uset_add(caseSensitive
, 0x3c2);
504 /* parser for CaseFolding.txt ----------------------------------------------- */
/*
 * Fixed-size storage for the entries read from CaseFolding.txt.
 * caseFoldingLineFn() fills caseFoldings[caseFoldingCount] and exits the
 * program once the count reaches MAX_CASE_FOLDING_COUNT.
 */
506 #define MAX_CASE_FOLDING_COUNT 2000
508 static CaseFolding caseFoldings
[MAX_CASE_FOLDING_COUNT
];
509 static int32_t caseFoldingCount
=0;
/*
 * Line callback for CaseFolding.txt, passed to u_parseDelimitedFile() by
 * parseCaseFolding(). It fills the next free slot
 * caseFoldings[caseFoldingCount]:
 *  - field 0: the code point (hexadecimal)
 *  - field 1: the status letter, one of L E C S F I T
 *  - field 2: the mapping, parsed into full[] (length in full[0], string
 *    from full[1]); u_parseString() also receives the address of .simple
 * Depending on the status, the new entry may be merged with the previous
 * entry for the same code point (S after F, F after S), or reduced to a
 * marker with an empty mapping (I and T).
 * prevCode is static so that the ascending-order check works across calls.
 * NOTE(review): local declarations, several condition and branch lines, and
 * the return/exit statements after errors are not visible in this excerpt.
 */
511 static void U_CALLCONV
512 caseFoldingLineFn(void *context
,
513 char *fields
[][2], int32_t fieldCount
,
514 UErrorCode
*pErrorCode
) {
516 static UChar32 prevCode
=0;
521 caseFoldings
[caseFoldingCount
].code
=(UChar32
)uprv_strtoul(u_skipWhitespace(fields
[0][0]), &end
, 16);
522 end
=(char *)u_skipWhitespace(end
);
/* the number must be non-empty and must extend to the end of field 0 */
523 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
524 fprintf(stderr
, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields
[0][0]);
525 *pErrorCode
=U_PARSE_ERROR
;
529 /* get the status of this mapping */
530 caseFoldings
[caseFoldingCount
].status
=status
=*u_skipWhitespace(fields
[1][0]);
531 if(status
!='L' && status
!='E' && status
!='C' && status
!='S' && status
!='F' && status
!='I' && status
!='T') {
532 fprintf(stderr
, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields
[0][0]);
533 *pErrorCode
=U_PARSE_ERROR
;
537 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
542 /* get the mapping */
543 count
=caseFoldings
[caseFoldingCount
].full
[0]=
544 (UChar
)u_parseString(fields
[2][0], caseFoldings
[caseFoldingCount
].full
+1, 31, (uint32_t *)&caseFoldings
[caseFoldingCount
].simple
, pErrorCode
);
545 if(U_FAILURE(*pErrorCode
)) {
546 fprintf(stderr
, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields
[0][0]);
550 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
551 if(count
==0 || count
>2 || (count
==2 && UTF_IS_SINGLE(caseFoldings
[caseFoldingCount
].full
[1]))) {
552 caseFoldings
[caseFoldingCount
].simple
=0;
555 /* update the case-sensitive set */
557 uset_add(caseSensitive
, (UChar32
)caseFoldings
[caseFoldingCount
].code
);
558 _set_addAll(caseSensitive
, caseFoldings
[caseFoldingCount
].full
+1, caseFoldings
[caseFoldingCount
].full
[0]);
561 /* check the status */
563 /* check if there was a full mapping for this code point before */
564 if( caseFoldingCount
>0 &&
565 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
&&
566 caseFoldings
[caseFoldingCount
-1].status
=='F'
568 /* merge the two entries */
/* the previous F entry takes over this entry's simple mapping */
569 caseFoldings
[caseFoldingCount
-1].simple
=caseFoldings
[caseFoldingCount
].simple
;
572 } else if(status
=='F') {
573 /* check if there was a simple mapping for this code point before */
574 if( caseFoldingCount
>0 &&
575 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
&&
576 caseFoldings
[caseFoldingCount
-1].status
=='S'
578 /* merge the two entries */
/* the previous S entry takes over this entry's whole full[] array, length included */
579 uprv_memcpy(caseFoldings
[caseFoldingCount
-1].full
, caseFoldings
[caseFoldingCount
].full
, 32*U_SIZEOF_UCHAR
);
582 } else if(status
=='I' || status
=='T') {
583 /* check if there was a default mapping for this code point before (remove it) */
584 while(caseFoldingCount
>0 &&
585 caseFoldings
[caseFoldingCount
-1].code
==caseFoldings
[caseFoldingCount
].code
590 /* store only a marker for special handling for cases like dotless i */
591 caseFoldings
[caseFoldingCount
].simple
=0;
592 caseFoldings
[caseFoldingCount
].full
[0]=0;
595 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
596 if(caseFoldings
[caseFoldingCount
].code
<=prevCode
&& caseFoldings
[caseFoldingCount
].code
>0) {
597 fprintf(stderr
, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
598 (unsigned long)caseFoldings
[caseFoldingCount
].code
,
599 (unsigned long)prevCode
);
600 *pErrorCode
=U_PARSE_ERROR
;
603 prevCode
=caseFoldings
[caseFoldingCount
].code
;
/* commit the slot; stop before the next call could write past the end of the array */
605 if(++caseFoldingCount
==MAX_CASE_FOLDING_COUNT
) {
606 fprintf(stderr
, "gencase: too many case folding mappings\n");
607 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
608 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
/*
 * Reads CaseFolding.txt: parses filename as a ';'-delimited file with
 * 3 fields per line, calling caseFoldingLineFn() for each line.
 * The function first tests for a NULL pErrorCode or an already-failed
 * *pErrorCode (the body of that test is not visible in this excerpt).
 */
613 parseCaseFolding(const char *filename
, UErrorCode
*pErrorCode
) {
616 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
620 u_parseDelimitedFile(filename
, ';', fields
, 3, caseFoldingLineFn
, NULL
, pErrorCode
);
623 /* parser for UnicodeData.txt ----------------------------------------------- */
625 /* general categories */
627 genCategoryNames
[U_CHAR_CATEGORY_COUNT
]={
629 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
630 "Mc", "Nd", "Nl", "No",
632 "Cc", "Cf", "Co", "Cs",
633 "Pd", "Ps", "Pe", "Pc", "Po",
634 "Sm", "Sc", "Sk", "So",
/*
 * Read positions in specialCasings[] and caseFoldings[].
 * unicodeDataLineFn() advances an index when the current UnicodeData code
 * point equals the code of the entry at that index; parseDB() afterwards
 * checks that both indexes reached their counts.
 */
638 static int32_t specialCasingIndex
=0, caseFoldingIndex
=0;
/*
 * Line callback for UnicodeData.txt, passed to u_parseDelimitedFile() by
 * parseDB(). For each line it builds a Props record p:
 *  - field 0: code point (hexadecimal)
 *  - field 2: general category, looked up in genCategoryNames
 *  - field 3: canonical combining class (decimal, at most 0xff)
 *  - fields 12, 13, 14: uppercase, lowercase and titlecase mappings
 *    (hexadecimal); a mapping that is non-zero and differs from the code
 *    point adds both code points to caseSensitive
 * It then attaches the SpecialCasing and CaseFolding entries whose code
 * equals p.code, rejects noncharacter code points, and checks that code
 * points arrive in ascending order (prevCode is static for that purpose).
 * NOTE(review): local declarations, the assignments of parsed values into p,
 * several branch lines, the return statements after errors and the final
 * statement that consumes p are not visible in this excerpt.
 */
640 static void U_CALLCONV
641 unicodeDataLineFn(void *context
,
642 char *fields
[][2], int32_t fieldCount
,
643 UErrorCode
*pErrorCode
) {
646 static UChar32 prevCode
=0;
650 /* reset the properties */
651 uprv_memset(&p
, 0, sizeof(Props
));
653 /* get the character code, field 0 */
654 p
.code
=(UChar32
)uprv_strtoul(fields
[0][0], &end
, 16);
655 if(end
<=fields
[0][0] || end
!=fields
[0][1]) {
656 fprintf(stderr
, "gencase: syntax error in field 0 at %s\n", fields
[0][0]);
657 *pErrorCode
=U_PARSE_ERROR
;
661 /* get general category, field 2 */
662 i
=getTokenIndex(genCategoryNames
, U_CHAR_CATEGORY_COUNT
, fields
[2][0]);
666 fprintf(stderr
, "gencase: unknown general category \"%s\" at code 0x%lx\n",
667 fields
[2][0], (unsigned long)p
.code
);
668 *pErrorCode
=U_PARSE_ERROR
;
672 /* get canonical combining class, field 3 */
673 value
=(UChar32
)uprv_strtoul(fields
[3][0], &end
, 10);
674 if(end
<=fields
[3][0] || end
!=fields
[3][1] || value
>0xff) {
675 fprintf(stderr
, "gencase: syntax error in field 3 at %s\n", fields
[0][0]);
676 *pErrorCode
=U_PARSE_ERROR
;
681 /* get uppercase mapping, field 12 */
682 value
=(UChar32
)uprv_strtoul(fields
[12][0], &end
, 16);
/* unlike field 0, an empty mapping field is accepted: only the end position is checked */
683 if(end
!=fields
[12][1]) {
684 fprintf(stderr
, "gencase: syntax error in field 12 at code 0x%lx\n",
685 (unsigned long)p
.code
);
686 *pErrorCode
=U_PARSE_ERROR
;
689 if(value
!=0 && value
!=p
.code
) {
691 uset_add(caseSensitive
, p
.code
);
692 uset_add(caseSensitive
, value
);
695 /* get lowercase value, field 13 */
696 value
=(UChar32
)uprv_strtoul(fields
[13][0], &end
, 16);
697 if(end
!=fields
[13][1]) {
698 fprintf(stderr
, "gencase: syntax error in field 13 at code 0x%lx\n",
699 (unsigned long)p
.code
);
700 *pErrorCode
=U_PARSE_ERROR
;
703 if(value
!=0 && value
!=p
.code
) {
705 uset_add(caseSensitive
, p
.code
);
706 uset_add(caseSensitive
, value
);
709 /* get titlecase value, field 14 */
710 value
=(UChar32
)uprv_strtoul(fields
[14][0], &end
, 16);
711 if(end
!=fields
[14][1]) {
712 fprintf(stderr
, "gencase: syntax error in field 14 at code 0x%lx\n",
713 (unsigned long)p
.code
);
714 *pErrorCode
=U_PARSE_ERROR
;
717 if(value
!=0 && value
!=p
.code
) {
719 uset_add(caseSensitive
, p
.code
);
720 uset_add(caseSensitive
, value
);
723 /* set additional properties from previously parsed files */
/* an entry is consumed only when its code equals the current code point */
724 if(specialCasingIndex
<specialCasingCount
&& p
.code
==specialCasings
[specialCasingIndex
].code
) {
725 p
.specialCasing
=specialCasings
+specialCasingIndex
++;
727 p
.specialCasing
=NULL
;
729 if(caseFoldingIndex
<caseFoldingCount
&& p
.code
==caseFoldings
[caseFoldingIndex
].code
) {
730 p
.caseFolding
=caseFoldings
+caseFoldingIndex
++;
732 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
733 if( p
.caseFolding
->status
=='C' &&
734 p
.caseFolding
->simple
==p
.lowerCase
742 /* check for non-character code points */
/* matches code points whose low 16 bits are FFFE or FFFF, and the range U+FDD0..U+FDEF */
743 if((p
.code
&0xfffe)==0xfffe || (uint32_t)(p
.code
-0xfdd0)<0x20) {
744 fprintf(stderr
, "gencase: error - properties for non-character code point U+%04lx\n",
745 (unsigned long)p
.code
);
746 *pErrorCode
=U_PARSE_ERROR
;
750 /* check that the code points (p.code) are in ascending order */
751 if(p
.code
<=prevCode
&& p
.code
>0) {
752 fprintf(stderr
, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
753 (unsigned long)p
.code
, (unsigned long)prevCode
);
754 *pErrorCode
=U_PARSE_ERROR
;
758 /* properties for a single code point */
/*
 * Reads UnicodeData.txt and finishes the case-sensitive data.
 *  1. Parses filename as a ';'-delimited file with 15 fields per line,
 *     calling unicodeDataLineFn() for each line.
 *  2. Reports a parse error if SpecialCasing or CaseFolding entries are left
 *     over, that is, if their code points never appeared in UnicodeData.txt.
 *  3. Walks the items of the caseSensitive set with uset_getItem() and
 *     passes each start/end range to addCaseSensitive().
 *  4. Resets a final U_INDEX_OUTOFBOUNDS_ERROR to U_ZERO_ERROR.
 * The function first tests for a NULL pErrorCode or an already-failed
 * *pErrorCode (the body of that test is not visible in this excerpt).
 * NOTE(review): the loop header around uset_getItem() is not visible here;
 * presumably the out-of-bounds error is how the walk ends after the last
 * item - confirm.
 */
765 parseDB(const char *filename
, UErrorCode
*pErrorCode
) {
770 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
774 u_parseDelimitedFile(filename
, ';', fields
, 15, unicodeDataLineFn
, NULL
, pErrorCode
);
776 /* are all sub-properties consumed? */
777 if(specialCasingIndex
<specialCasingCount
) {
778 fprintf(stderr
, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
779 *pErrorCode
=U_PARSE_ERROR
;
782 if(caseFoldingIndex
<caseFoldingCount
) {
783 fprintf(stderr
, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
784 *pErrorCode
=U_PARSE_ERROR
;
788 if(U_FAILURE(*pErrorCode
)) {
/* the walk continues while uset_getItem() returns 0 and reports success */
793 0==uset_getItem(caseSensitive
, i
, &start
, &end
, NULL
, 0, pErrorCode
) && U_SUCCESS(*pErrorCode
);
796 addCaseSensitive(start
, end
);
798 if(*pErrorCode
==U_INDEX_OUTOFBOUNDS_ERROR
) {
799 *pErrorCode
=U_ZERO_ERROR
;
804 * Hey, Emacs, please set the following:
807 * indent-tabs-mode: nil