]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gencase/gencase.c
ICU-6.2.15.tar.gz
[apple/icu.git] / icuSources / tools / gencase / gencase.c
CommitLineData
374ca955
A
/*
*******************************************************************************
*
*   Copyright (C) 2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  gencase.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004aug28
*   created by: Markus W. Scherer
*
*   This program reads several of the Unicode character database text files,
*   parses them, and extracts the case mapping properties for each character.
*   It then writes a binary file containing the properties
*   that is designed to be used directly for random-access to
*   the properties of each Unicode character.
*/
22
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uset.h"
#include "unicode/putil.h"
#include "unicode/uclean.h"
#include "cmemory.h"
#include "cstring.h"
#include "uarrsort.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "uprops.h"
#include "propsvec.h"
#include "gencase.h"
38
39#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
40
41/* data --------------------------------------------------------------------- */
42
43uint32_t *pv;
44
45UBool beVerbose=FALSE, haveCopyright=TRUE;
46
47/*
48 * Unicode set collecting the case-sensitive characters;
49 * see uchar.h UCHAR_CASE_SENSITIVE.
50 * Add code points from case mappings/foldings in
51 * the root locale and with default options.
52 */
53static USet *caseSensitive;
54
55/* prototypes --------------------------------------------------------------- */
56
57static void
58parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
59
60static void
61parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
62
63static void
64parseDB(const char *filename, UErrorCode *pErrorCode);
65
/* parse files with multiple binary properties ------------------------------ */

/* TODO: more common code, move functions to uparse.h|c */

/* TODO: similar to genprops/props2.c but not the same */

/*
 * Describes how one binary property from a UCD file is stored:
 * the property name as it appears in the file, and the word, value and mask
 * that are handed to upvec_setValue() for every code point that has it.
 */
struct Binary {
    const char *propName;
    int32_t vecWord;
    uint32_t vecValue, vecMask;
};
typedef struct Binary Binary;

/*
 * Describes one UCD file with binary properties: the file's base name
 * (without suffix and ".txt") and the table of properties to pick up from it.
 */
struct Binaries {
    const char *ucdFile;
    const Binary *binaries;
    int32_t binariesCount;
};
typedef struct Binaries Binaries;
85
86static const Binary
87propListNames[]={
88 { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK }
89};
90
91static const Binaries
92propListBinaries={
93 "PropList", propListNames, LENGTHOF(propListNames)
94};
95
96static const Binary
97derCorePropsNames[]={
98 { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK },
99 { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK }
100};
101
102static const Binaries
103derCorePropsBinaries={
104 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
105};
106
107static void U_CALLCONV
108binariesLineFn(void *context,
109 char *fields[][2], int32_t fieldCount,
110 UErrorCode *pErrorCode) {
111 const Binaries *bin;
112 char *s;
113 uint32_t start, limit;
114 int32_t i;
115
116 bin=(const Binaries *)context;
117
118 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
119 if(U_FAILURE(*pErrorCode)) {
120 fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
121 exit(*pErrorCode);
122 }
123 ++limit;
124
125 /* parse binary property name */
126 s=(char *)u_skipWhitespace(fields[1][0]);
127 for(i=0;; ++i) {
128 if(i==bin->binariesCount) {
129 /* ignore unrecognized properties */
130 return;
131 }
132 if(isToken(bin->binaries[i].propName, s)) {
133 break;
134 }
135 }
136
137 if(bin->binaries[i].vecMask==0) {
138 fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
139 (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
140 exit(U_INTERNAL_PROGRAM_ERROR);
141 }
142
143 if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) {
144 fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
145 bin->binaries[i].propName, u_errorName(*pErrorCode));
146 exit(*pErrorCode);
147 }
148}
149
150static void
151parseBinariesFile(char *filename, char *basename, const char *suffix,
152 const Binaries *bin,
153 UErrorCode *pErrorCode) {
154 char *fields[2][2];
155
156 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
157 return;
158 }
159
160 writeUCDFilename(basename, bin->ucdFile, suffix);
161
162 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
163 if(U_FAILURE(*pErrorCode)) {
164 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
165 }
166}
167
/* -------------------------------------------------------------------------- */

/* Indexes into the options[] array; the order must match its initializer. */
enum
{
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR
};
181
182/* Keep these values in sync with the above enums */
183static UOption options[]={
184 UOPTION_HELP_H,
185 UOPTION_HELP_QUESTION_MARK,
186 UOPTION_VERBOSE,
187 UOPTION_COPYRIGHT,
188 UOPTION_DESTDIR,
189 UOPTION_SOURCEDIR,
190 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
191 UOPTION_ICUDATADIR
192};
193
194extern int
195main(int argc, char* argv[]) {
196 char filename[300];
197 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
198 char *basename=NULL;
199 UErrorCode errorCode=U_ZERO_ERROR;
200
201 U_MAIN_INIT_ARGS(argc, argv);
202
203 /* preset then read command line options */
204 options[DESTDIR].value=u_getDataDirectory();
205 options[SOURCEDIR].value="";
206 options[UNICODE_VERSION].value="";
207 options[ICUDATADIR].value=u_getDataDirectory();
208 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
209
210 /* error handling, printing usage message */
211 if(argc<0) {
212 fprintf(stderr,
213 "error in command line argument \"%s\"\n",
214 argv[-argc]);
215 }
216 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
217 /*
218 * Broken into chucks because the C89 standard says the minimum
219 * required supported string length is 509 bytes.
220 */
221 fprintf(stderr,
222 "Usage: %s [-options] [suffix]\n"
223 "\n"
224 "read the UnicodeData.txt file and other Unicode properties files and\n"
225 "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
226 "\n",
227 argv[0]);
228 fprintf(stderr,
229 "Options:\n"
230 "\t-h or -? or --help this usage text\n"
231 "\t-v or --verbose verbose output\n"
232 "\t-c or --copyright include a copyright notice\n"
233 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
234 fprintf(stderr,
235 "\t-d or --destdir destination directory, followed by the path\n"
236 "\t-s or --sourcedir source directory, followed by the path\n"
237 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
238 "\t followed by path, defaults to %s\n"
239 "\tsuffix suffix that is to be appended with a '-'\n"
240 "\t to the source file basenames before opening;\n"
241 "\t 'gencase new' will read UnicodeData-new.txt etc.\n",
242 u_getDataDirectory());
243 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
244 }
245
246 /* get the options values */
247 beVerbose=options[VERBOSE].doesOccur;
248 haveCopyright=options[COPYRIGHT].doesOccur;
249 srcDir=options[SOURCEDIR].value;
250 destDir=options[DESTDIR].value;
251
252 if(argc>=2) {
253 suffix=argv[1];
254 } else {
255 suffix=NULL;
256 }
257
258 if(options[UNICODE_VERSION].doesOccur) {
259 setUnicodeVersion(options[UNICODE_VERSION].value);
260 }
261 /* else use the default dataVersion in store.c */
262
263 if (options[ICUDATADIR].doesOccur) {
264 u_setDataDirectory(options[ICUDATADIR].value);
265 }
266
267 /* prepare the filename beginning with the source dir */
268 uprv_strcpy(filename, srcDir);
269 basename=filename+uprv_strlen(filename);
270 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
271 *basename++=U_FILE_SEP_CHAR;
272 }
273
274 /* initialize */
275 pv=upvec_open(1, 10000);
276 caseSensitive=uset_open(1, 0); /* empty set (start>end) */
277
278 /* process SpecialCasing.txt */
279 writeUCDFilename(basename, "SpecialCasing", suffix);
280 parseSpecialCasing(filename, &errorCode);
281
282 /* process CaseFolding.txt */
283 writeUCDFilename(basename, "CaseFolding", suffix);
284 parseCaseFolding(filename, &errorCode);
285
286 /* process additional properties files */
287 *basename=0;
288
289 parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
290
291 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
292
293 /* process UnicodeData.txt */
294 writeUCDFilename(basename, "UnicodeData", suffix);
295 parseDB(filename, &errorCode);
296
297 /* process parsed data */
298 makeCaseClosure();
299
300 makeExceptions();
301
302 if(U_SUCCESS(errorCode)) {
303 /* write the properties data file */
304 generateData(destDir);
305 }
306
307 u_cleanup();
308 return errorCode;
309}
310
311U_CFUNC void
312writeUCDFilename(char *basename, const char *filename, const char *suffix) {
313 int32_t length=(int32_t)uprv_strlen(filename);
314 uprv_strcpy(basename, filename);
315 if(suffix!=NULL) {
316 basename[length++]='-';
317 uprv_strcpy(basename+length, suffix);
318 length+=(int32_t)uprv_strlen(suffix);
319 }
320 uprv_strcpy(basename+length, ".txt");
321}
322
323/* TODO: move to toolutil */
324U_CFUNC UBool
325isToken(const char *token, const char *s) {
326 const char *z;
327 int32_t j;
328
329 s=u_skipWhitespace(s);
330 for(j=0;; ++j) {
331 if(token[j]!=0) {
332 if(s[j]!=token[j]) {
333 break;
334 }
335 } else {
336 z=u_skipWhitespace(s+j);
337 if(*z==';' || *z==0) {
338 return TRUE;
339 } else {
340 break;
341 }
342 }
343 }
344
345 return FALSE;
346}
347
/*
 * Find the token that s starts with.
 *
 * tokens       array of candidate tokens; NULL entries are skipped
 * countTokens  number of entries in tokens
 * s            text to look at; leading white space is skipped
 *
 * A token matches if all of its characters match and if it is followed -
 * after optional white space - by ';', '#', CR, LF or the end of the string.
 * Returns the index of the first matching token, or -1 if there is none.
 */
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    const char *t, *z;
    int32_t i, j;

    s=u_skipWhitespace(s);
    for(i=0; i<countTokens; ++i) {
        t=tokens[i];
        if(t==NULL) {
            continue;
        }

        /* advance over the common prefix of the token and the text */
        for(j=0; t[j]!=0 && s[j]==t[j]; ++j) {}
        if(t[j]!=0) {
            /* mismatch before the end of the token */
            continue;
        }

        z=u_skipWhitespace(s+j);
        if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
            return i;
        }
    }
    return -1;
}
375
376static void
377_set_addAll(USet *set, const UChar *s, int32_t length) {
378 UChar32 c;
379 int32_t i;
380
381 /* needs length>=0 */
382 for(i=0; i<length; /* U16_NEXT advances i */) {
383 U16_NEXT(s, i, length, c);
384 uset_add(set, c);
385 }
386}
387
388/* parser for SpecialCasing.txt --------------------------------------------- */
389
390#define MAX_SPECIAL_CASING_COUNT 500
391
392static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
393static int32_t specialCasingCount=0;
394
395static void U_CALLCONV
396specialCasingLineFn(void *context,
397 char *fields[][2], int32_t fieldCount,
398 UErrorCode *pErrorCode) {
399 char *end;
400
401 /* get code point */
402 specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
403 end=(char *)u_skipWhitespace(end);
404 if(end<=fields[0][0] || end!=fields[0][1]) {
405 fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
406 *pErrorCode=U_PARSE_ERROR;
407 exit(U_PARSE_ERROR);
408 }
409
410 /* is this a complex mapping? */
411 if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
412 /* there is some condition text in the fifth field */
413 specialCasings[specialCasingCount].isComplex=TRUE;
414
415 /* do not store any actual mappings for this */
416 specialCasings[specialCasingCount].lowerCase[0]=0;
417 specialCasings[specialCasingCount].upperCase[0]=0;
418 specialCasings[specialCasingCount].titleCase[0]=0;
419 } else {
420 /* just set the "complex" flag and get the case mappings */
421 specialCasings[specialCasingCount].isComplex=FALSE;
422 specialCasings[specialCasingCount].lowerCase[0]=
423 (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
424 specialCasings[specialCasingCount].upperCase[0]=
425 (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
426 specialCasings[specialCasingCount].titleCase[0]=
427 (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
428 if(U_FAILURE(*pErrorCode)) {
429 fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
430 exit(*pErrorCode);
431 }
432
433 uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
434 _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
435 _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
436 _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
437 }
438
439 if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
440 fprintf(stderr, "gencase: too many special casing mappings\n");
441 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
442 exit(U_INDEX_OUTOFBOUNDS_ERROR);
443 }
444}
445
446static int32_t U_CALLCONV
447compareSpecialCasings(const void *context, const void *left, const void *right) {
448 return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
449}
450
451static void
452parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
453 char *fields[5][2];
454 int32_t i, j;
455
456 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
457 return;
458 }
459
460 u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
461
462 /* sort the special casing entries by code point */
463 if(specialCasingCount>0) {
464 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
465 compareSpecialCasings, NULL, FALSE, pErrorCode);
466 }
467 if(U_FAILURE(*pErrorCode)) {
468 return;
469 }
470
471 /* replace multiple entries for any code point by one "complex" one */
472 j=0;
473 for(i=1; i<specialCasingCount; ++i) {
474 if(specialCasings[i-1].code==specialCasings[i].code) {
475 /* there is a duplicate code point */
476 specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */
477 specialCasings[i].isComplex=TRUE; /* make the following one complex */
478 specialCasings[i].lowerCase[0]=0;
479 specialCasings[i].upperCase[0]=0;
480 specialCasings[i].titleCase[0]=0;
481 ++j;
482 }
483 }
484
485 /* if some entries just were removed, then re-sort */
486 if(j>0) {
487 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
488 compareSpecialCasings, NULL, FALSE, pErrorCode);
489 specialCasingCount-=j;
490 }
491 if(U_FAILURE(*pErrorCode)) {
492 return;
493 }
494
495 /*
496 * Add one complex mapping to caseSensitive that was filtered out above:
497 * Greek final Sigma has a conditional mapping but not locale-sensitive,
498 * and it is taken when lowercasing just U+03A3 alone.
499 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
500 */
501 uset_add(caseSensitive, 0x3c2);
502}
503
504/* parser for CaseFolding.txt ----------------------------------------------- */
505
506#define MAX_CASE_FOLDING_COUNT 2000
507
508static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
509static int32_t caseFoldingCount=0;
510
511static void U_CALLCONV
512caseFoldingLineFn(void *context,
513 char *fields[][2], int32_t fieldCount,
514 UErrorCode *pErrorCode) {
515 char *end;
516 static UChar32 prevCode=0;
517 int32_t count;
518 char status;
519
520 /* get code point */
521 caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
522 end=(char *)u_skipWhitespace(end);
523 if(end<=fields[0][0] || end!=fields[0][1]) {
524 fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
525 *pErrorCode=U_PARSE_ERROR;
526 exit(U_PARSE_ERROR);
527 }
528
529 /* get the status of this mapping */
530 caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
531 if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
532 fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
533 *pErrorCode=U_PARSE_ERROR;
534 exit(U_PARSE_ERROR);
535 }
536
537 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
538 if(status=='L') {
539 return;
540 }
541
542 /* get the mapping */
543 count=caseFoldings[caseFoldingCount].full[0]=
544 (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
545 if(U_FAILURE(*pErrorCode)) {
546 fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
547 exit(*pErrorCode);
548 }
549
550 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
551 if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
552 caseFoldings[caseFoldingCount].simple=0;
553 }
554
555 /* update the case-sensitive set */
556 if(status!='T') {
557 uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
558 _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
559 }
560
561 /* check the status */
562 if(status=='S') {
563 /* check if there was a full mapping for this code point before */
564 if( caseFoldingCount>0 &&
565 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
566 caseFoldings[caseFoldingCount-1].status=='F'
567 ) {
568 /* merge the two entries */
569 caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
570 return;
571 }
572 } else if(status=='F') {
573 /* check if there was a simple mapping for this code point before */
574 if( caseFoldingCount>0 &&
575 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
576 caseFoldings[caseFoldingCount-1].status=='S'
577 ) {
578 /* merge the two entries */
579 uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
580 return;
581 }
582 } else if(status=='I' || status=='T') {
583 /* check if there was a default mapping for this code point before (remove it) */
584 while(caseFoldingCount>0 &&
585 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
586 ) {
587 prevCode=0;
588 --caseFoldingCount;
589 }
590 /* store only a marker for special handling for cases like dotless i */
591 caseFoldings[caseFoldingCount].simple=0;
592 caseFoldings[caseFoldingCount].full[0]=0;
593 }
594
595 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
596 if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
597 fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
598 (unsigned long)caseFoldings[caseFoldingCount].code,
599 (unsigned long)prevCode);
600 *pErrorCode=U_PARSE_ERROR;
601 exit(U_PARSE_ERROR);
602 }
603 prevCode=caseFoldings[caseFoldingCount].code;
604
605 if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
606 fprintf(stderr, "gencase: too many case folding mappings\n");
607 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
608 exit(U_INDEX_OUTOFBOUNDS_ERROR);
609 }
610}
611
612static void
613parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
614 char *fields[3][2];
615
616 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
617 return;
618 }
619
620 u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
621}
622
623/* parser for UnicodeData.txt ----------------------------------------------- */
624
625/* general categories */
626const char *const
627genCategoryNames[U_CHAR_CATEGORY_COUNT]={
628 "Cn",
629 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
630 "Mc", "Nd", "Nl", "No",
631 "Zs", "Zl", "Zp",
632 "Cc", "Cf", "Co", "Cs",
633 "Pd", "Ps", "Pe", "Pc", "Po",
634 "Sm", "Sc", "Sk", "So",
635 "Pi", "Pf"
636};
637
638static int32_t specialCasingIndex=0, caseFoldingIndex=0;
639
640static void U_CALLCONV
641unicodeDataLineFn(void *context,
642 char *fields[][2], int32_t fieldCount,
643 UErrorCode *pErrorCode) {
644 Props p;
645 char *end;
646 static UChar32 prevCode=0;
647 UChar32 value;
648 int32_t i;
649
650 /* reset the properties */
651 uprv_memset(&p, 0, sizeof(Props));
652
653 /* get the character code, field 0 */
654 p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
655 if(end<=fields[0][0] || end!=fields[0][1]) {
656 fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
657 *pErrorCode=U_PARSE_ERROR;
658 exit(U_PARSE_ERROR);
659 }
660
661 /* get general category, field 2 */
662 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
663 if(i>=0) {
664 p.gc=(uint8_t)i;
665 } else {
666 fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
667 fields[2][0], (unsigned long)p.code);
668 *pErrorCode=U_PARSE_ERROR;
669 exit(U_PARSE_ERROR);
670 }
671
672 /* get canonical combining class, field 3 */
673 value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
674 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
675 fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
676 *pErrorCode=U_PARSE_ERROR;
677 exit(U_PARSE_ERROR);
678 }
679 p.cc=(uint8_t)value;
680
681 /* get uppercase mapping, field 12 */
682 value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
683 if(end!=fields[12][1]) {
684 fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
685 (unsigned long)p.code);
686 *pErrorCode=U_PARSE_ERROR;
687 exit(U_PARSE_ERROR);
688 }
689 if(value!=0 && value!=p.code) {
690 p.upperCase=value;
691 uset_add(caseSensitive, p.code);
692 uset_add(caseSensitive, value);
693 }
694
695 /* get lowercase value, field 13 */
696 value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
697 if(end!=fields[13][1]) {
698 fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
699 (unsigned long)p.code);
700 *pErrorCode=U_PARSE_ERROR;
701 exit(U_PARSE_ERROR);
702 }
703 if(value!=0 && value!=p.code) {
704 p.lowerCase=value;
705 uset_add(caseSensitive, p.code);
706 uset_add(caseSensitive, value);
707 }
708
709 /* get titlecase value, field 14 */
710 value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
711 if(end!=fields[14][1]) {
712 fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
713 (unsigned long)p.code);
714 *pErrorCode=U_PARSE_ERROR;
715 exit(U_PARSE_ERROR);
716 }
717 if(value!=0 && value!=p.code) {
718 p.titleCase=value;
719 uset_add(caseSensitive, p.code);
720 uset_add(caseSensitive, value);
721 }
722
723 /* set additional properties from previously parsed files */
724 if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
725 p.specialCasing=specialCasings+specialCasingIndex++;
726 } else {
727 p.specialCasing=NULL;
728 }
729 if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
730 p.caseFolding=caseFoldings+caseFoldingIndex++;
731
732 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
733 if( p.caseFolding->status=='C' &&
734 p.caseFolding->simple==p.lowerCase
735 ) {
736 p.caseFolding=NULL;
737 }
738 } else {
739 p.caseFolding=NULL;
740 }
741
742 /* check for non-character code points */
743 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
744 fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
745 (unsigned long)p.code);
746 *pErrorCode=U_PARSE_ERROR;
747 exit(U_PARSE_ERROR);
748 }
749
750 /* check that the code points (p.code) are in ascending order */
751 if(p.code<=prevCode && p.code>0) {
752 fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
753 (unsigned long)p.code, (unsigned long)prevCode);
754 *pErrorCode=U_PARSE_ERROR;
755 exit(U_PARSE_ERROR);
756 }
757
758 /* properties for a single code point */
759 setProps(&p);
760
761 prevCode=p.code;
762}
763
764static void
765parseDB(const char *filename, UErrorCode *pErrorCode) {
766 char *fields[15][2];
767 UChar32 start, end;
768 int32_t i;
769
770 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
771 return;
772 }
773
774 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
775
776 /* are all sub-properties consumed? */
777 if(specialCasingIndex<specialCasingCount) {
778 fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
779 *pErrorCode=U_PARSE_ERROR;
780 exit(U_PARSE_ERROR);
781 }
782 if(caseFoldingIndex<caseFoldingCount) {
783 fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
784 *pErrorCode=U_PARSE_ERROR;
785 exit(U_PARSE_ERROR);
786 }
787
788 if(U_FAILURE(*pErrorCode)) {
789 return;
790 }
791
792 for(i=0;
793 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
794 ++i
795 ) {
796 addCaseSensitive(start, end);
797 }
798 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
799 *pErrorCode=U_ZERO_ERROR;
800 }
801}
802
/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */