]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genprops/genprops.c
ICU-6.2.10.tar.gz
[apple/icu.git] / icuSources / tools / genprops / genprops.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: genprops.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999dec08
14 * created by: Markus W. Scherer
15 *
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts most of the properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
21 */
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include "unicode/utypes.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uset.h"
28 #include "unicode/putil.h"
29 #include "unicode/uclean.h"
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "unewdata.h"
33 #include "uoptions.h"
34 #include "uparse.h"
35 #include "uprops.h"
36 #include "propsvec.h"
37
38 U_CDECL_BEGIN
39 #include "genprops.h"
40 U_CDECL_END
41
/* Number of elements of a true array (not meaningful for pointers or array parameters). */
#define LENGTHOF(array) (sizeof(array)/sizeof(*(array)))
43
44 UBool beVerbose=FALSE, haveCopyright=TRUE;
45
46 /*
47 * Unicode set collecting the case-sensitive characters;
48 * see uchar.h UCHAR_CASE_SENSITIVE.
49 * Add code points from case mappings/foldings in
50 * the root locale and with default options.
51 */
52 static USet *caseSensitive;
53
54 /* prototypes --------------------------------------------------------------- */
55
56 static void
57 parseBidiMirroring(const char *filename, UErrorCode *pErrorCode);
58
59 static void
60 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
61
62 static void
63 parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
64
65 static void
66 parseDB(const char *filename, UErrorCode *pErrorCode);
67
68 /* -------------------------------------------------------------------------- */
69
70
71 enum
72 {
73 HELP_H,
74 HELP_QUESTION_MARK,
75 VERBOSE,
76 COPYRIGHT,
77 DESTDIR,
78 SOURCEDIR,
79 UNICODE_VERSION,
80 ICUDATADIR
81 };
82
83 /* Keep these values in sync with the above enums */
84 static UOption options[]={
85 UOPTION_HELP_H,
86 UOPTION_HELP_QUESTION_MARK,
87 UOPTION_VERBOSE,
88 UOPTION_COPYRIGHT,
89 UOPTION_DESTDIR,
90 UOPTION_SOURCEDIR,
91 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
92 UOPTION_ICUDATADIR
93 };
94
95 extern int
96 main(int argc, char* argv[]) {
97 char filename[300];
98 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
99 char *basename=NULL;
100 UErrorCode errorCode=U_ZERO_ERROR;
101
102 U_MAIN_INIT_ARGS(argc, argv);
103
104 /* preset then read command line options */
105 options[DESTDIR].value=u_getDataDirectory();
106 options[SOURCEDIR].value="";
107 options[UNICODE_VERSION].value="";
108 options[ICUDATADIR].value=u_getDataDirectory();
109 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
110
111 /* error handling, printing usage message */
112 if(argc<0) {
113 fprintf(stderr,
114 "error in command line argument \"%s\"\n",
115 argv[-argc]);
116 }
117 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
118 /*
119 * Broken into chucks because the C89 standard says the minimum
120 * required supported string length is 509 bytes.
121 */
122 fprintf(stderr,
123 "Usage: %s [-options] [suffix]\n"
124 "\n"
125 "read the UnicodeData.txt file and other Unicode properties files and\n"
126 "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
127 "\n",
128 argv[0]);
129 fprintf(stderr,
130 "Options:\n"
131 "\t-h or -? or --help this usage text\n"
132 "\t-v or --verbose verbose output\n"
133 "\t-c or --copyright include a copyright notice\n"
134 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
135 fprintf(stderr,
136 "\t-d or --destdir destination directory, followed by the path\n"
137 "\t-s or --sourcedir source directory, followed by the path\n"
138 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
139 "\t followed by path, defaults to %s\n"
140 "\tsuffix suffix that is to be appended with a '-'\n"
141 "\t to the source file basenames before opening;\n"
142 "\t 'genprops new' will read UnicodeData-new.txt etc.\n",
143 u_getDataDirectory());
144 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
145 }
146
147 /* get the options values */
148 beVerbose=options[VERBOSE].doesOccur;
149 haveCopyright=options[COPYRIGHT].doesOccur;
150 srcDir=options[SOURCEDIR].value;
151 destDir=options[DESTDIR].value;
152
153 if(argc>=2) {
154 suffix=argv[1];
155 } else {
156 suffix=NULL;
157 }
158
159 if(options[UNICODE_VERSION].doesOccur) {
160 setUnicodeVersion(options[UNICODE_VERSION].value);
161 }
162 /* else use the default dataVersion in store.c */
163
164 if (options[ICUDATADIR].doesOccur) {
165 u_setDataDirectory(options[ICUDATADIR].value);
166 }
167
168 /* prepare the filename beginning with the source dir */
169 uprv_strcpy(filename, srcDir);
170 basename=filename+uprv_strlen(filename);
171 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
172 *basename++=U_FILE_SEP_CHAR;
173 }
174
175 /* initialize */
176 initStore();
177 caseSensitive=uset_open(1, 0); /* empty set (start>end) */
178
179 /* process BidiMirroring.txt */
180 writeUCDFilename(basename, "BidiMirroring", suffix);
181 parseBidiMirroring(filename, &errorCode);
182
183 /* process SpecialCasing.txt */
184 writeUCDFilename(basename, "SpecialCasing", suffix);
185 parseSpecialCasing(filename, &errorCode);
186
187 /* process CaseFolding.txt */
188 writeUCDFilename(basename, "CaseFolding", suffix);
189 parseCaseFolding(filename, &errorCode);
190
191 /* process UnicodeData.txt */
192 writeUCDFilename(basename, "UnicodeData", suffix);
193 parseDB(filename, &errorCode);
194
195 /* process additional properties files */
196 *basename=0;
197 generateAdditionalProperties(filename, suffix, &errorCode);
198
199 /* process parsed data */
200 if(U_SUCCESS(errorCode)) {
201 /* write the properties data file */
202 generateData(destDir);
203 }
204
205 u_cleanup();
206 return errorCode;
207 }
208
209 U_CFUNC void
210 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
211 int32_t length=(int32_t)uprv_strlen(filename);
212 uprv_strcpy(basename, filename);
213 if(suffix!=NULL) {
214 basename[length++]='-';
215 uprv_strcpy(basename+length, suffix);
216 length+=(int32_t)uprv_strlen(suffix);
217 }
218 uprv_strcpy(basename+length, ".txt");
219 }
220
221 U_CFUNC UBool
222 isToken(const char *token, const char *s) {
223 const char *z;
224 int32_t j;
225
226 s=u_skipWhitespace(s);
227 for(j=0;; ++j) {
228 if(token[j]!=0) {
229 if(s[j]!=token[j]) {
230 break;
231 }
232 } else {
233 z=u_skipWhitespace(s+j);
234 if(*z==';' || *z==0) {
235 return TRUE;
236 } else {
237 break;
238 }
239 }
240 }
241
242 return FALSE;
243 }
244
245 U_CFUNC int32_t
246 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
247 const char *t, *z;
248 int32_t i, j;
249
250 s=u_skipWhitespace(s);
251 for(i=0; i<countTokens; ++i) {
252 t=tokens[i];
253 if(t!=NULL) {
254 for(j=0;; ++j) {
255 if(t[j]!=0) {
256 if(s[j]!=t[j]) {
257 break;
258 }
259 } else {
260 z=u_skipWhitespace(s+j);
261 if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
262 return i;
263 } else {
264 break;
265 }
266 }
267 }
268 }
269 }
270 return -1;
271 }
272
273 static void
274 _set_addAll(USet *set, const UChar *s, int32_t length) {
275 UChar32 c;
276 int32_t i;
277
278 /* needs length>=0 */
279 for(i=0; i<length; /* U16_NEXT advances i */) {
280 U16_NEXT(s, i, length, c);
281 uset_add(set, c);
282 }
283 }
284
285 /* parser for BidiMirroring.txt --------------------------------------------- */
286
287 #define MAX_MIRROR_COUNT 2000
288
289 static uint32_t mirrorMappings[MAX_MIRROR_COUNT][2];
290 static int32_t mirrorCount=0;
291
292 static void U_CALLCONV
293 mirrorLineFn(void *context,
294 char *fields[][2], int32_t fieldCount,
295 UErrorCode *pErrorCode) {
296 char *end;
297 static uint32_t prevCode=0;
298
299 mirrorMappings[mirrorCount][0]=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
300 if(end<=fields[0][0] || end!=fields[0][1]) {
301 fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]);
302 *pErrorCode=U_PARSE_ERROR;
303 exit(U_PARSE_ERROR);
304 }
305
306 mirrorMappings[mirrorCount][1]=(uint32_t)uprv_strtoul(fields[1][0], &end, 16);
307 if(end<=fields[1][0] || end!=fields[1][1]) {
308 fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]);
309 *pErrorCode=U_PARSE_ERROR;
310 exit(U_PARSE_ERROR);
311 }
312
313 /* check that the code points (mirrorMappings[mirrorCount][0]) are in ascending order */
314 if(mirrorMappings[mirrorCount][0]<=prevCode && mirrorMappings[mirrorCount][0]>0) {
315 fprintf(stderr, "genprops: error - BidiMirroring entries out of order, U+%04lx after U+%04lx\n",
316 (unsigned long)mirrorMappings[mirrorCount][0],
317 (unsigned long)prevCode);
318 *pErrorCode=U_PARSE_ERROR;
319 exit(U_PARSE_ERROR);
320 }
321 prevCode=mirrorMappings[mirrorCount][0];
322
323 if(++mirrorCount==MAX_MIRROR_COUNT) {
324 fprintf(stderr, "genprops: too many mirror mappings\n");
325 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
326 exit(U_INDEX_OUTOFBOUNDS_ERROR);
327 }
328 }
329
330 static void
331 parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) {
332 char *fields[2][2];
333
334 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
335 return;
336 }
337
338 u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode);
339 }
340
341 /* parser for SpecialCasing.txt --------------------------------------------- */
342
343 #define MAX_SPECIAL_CASING_COUNT 500
344
345 static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
346 static int32_t specialCasingCount=0;
347
348 static void U_CALLCONV
349 specialCasingLineFn(void *context,
350 char *fields[][2], int32_t fieldCount,
351 UErrorCode *pErrorCode) {
352 char *end;
353
354 /* get code point */
355 specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
356 end=(char *)u_skipWhitespace(end);
357 if(end<=fields[0][0] || end!=fields[0][1]) {
358 fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
359 *pErrorCode=U_PARSE_ERROR;
360 exit(U_PARSE_ERROR);
361 }
362
363 /* is this a complex mapping? */
364 if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
365 /* there is some condition text in the fifth field */
366 specialCasings[specialCasingCount].isComplex=TRUE;
367
368 /* do not store any actual mappings for this */
369 specialCasings[specialCasingCount].lowerCase[0]=0;
370 specialCasings[specialCasingCount].upperCase[0]=0;
371 specialCasings[specialCasingCount].titleCase[0]=0;
372 } else {
373 /* just set the "complex" flag and get the case mappings */
374 specialCasings[specialCasingCount].isComplex=FALSE;
375 specialCasings[specialCasingCount].lowerCase[0]=
376 (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
377 specialCasings[specialCasingCount].upperCase[0]=
378 (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
379 specialCasings[specialCasingCount].titleCase[0]=
380 (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
381 if(U_FAILURE(*pErrorCode)) {
382 fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]);
383 exit(*pErrorCode);
384 }
385
386 uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
387 _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
388 _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
389 _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
390 }
391
392 if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
393 fprintf(stderr, "genprops: too many special casing mappings\n");
394 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
395 exit(U_INDEX_OUTOFBOUNDS_ERROR);
396 }
397 }
398
399 static int
400 compareSpecialCasings(const void *left, const void *right) {
401 return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
402 }
403
404 static void
405 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
406 char *fields[5][2];
407 int32_t i, j;
408
409 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
410 return;
411 }
412
413 u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
414
415 /* sort the special casing entries by code point */
416 if(specialCasingCount>0) {
417 qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
418 }
419
420 /* replace multiple entries for any code point by one "complex" one */
421 j=0;
422 for(i=1; i<specialCasingCount; ++i) {
423 if(specialCasings[i-1].code==specialCasings[i].code) {
424 /* there is a duplicate code point */
425 specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following qsort */
426 specialCasings[i].isComplex=TRUE; /* make the following one complex */
427 specialCasings[i].lowerCase[0]=0;
428 specialCasings[i].upperCase[0]=0;
429 specialCasings[i].titleCase[0]=0;
430 ++j;
431 }
432 }
433
434 /* if some entries just were removed, then re-sort */
435 if(j>0) {
436 qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
437 specialCasingCount-=j;
438 }
439
440 /*
441 * Add one complex mapping to caseSensitive that was filtered out above:
442 * Greek final Sigma has a conditional mapping but not locale-sensitive,
443 * and it is taken when lowercasing just U+03A3 alone.
444 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
445 */
446 uset_add(caseSensitive, 0x3c2);
447 }
448
449 /* parser for CaseFolding.txt ----------------------------------------------- */
450
451 #define MAX_CASE_FOLDING_COUNT 2000
452
453 static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
454 static int32_t caseFoldingCount=0;
455
456 static void U_CALLCONV
457 caseFoldingLineFn(void *context,
458 char *fields[][2], int32_t fieldCount,
459 UErrorCode *pErrorCode) {
460 char *end;
461 static uint32_t prevCode=0;
462 int32_t count;
463 char status;
464
465 /* get code point */
466 caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
467 end=(char *)u_skipWhitespace(end);
468 if(end<=fields[0][0] || end!=fields[0][1]) {
469 fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
470 *pErrorCode=U_PARSE_ERROR;
471 exit(U_PARSE_ERROR);
472 }
473
474 /* get the status of this mapping */
475 caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
476 if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
477 fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
478 *pErrorCode=U_PARSE_ERROR;
479 exit(U_PARSE_ERROR);
480 }
481
482 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
483 if(status=='L') {
484 return;
485 }
486
487 /* get the mapping */
488 count=caseFoldings[caseFoldingCount].full[0]=
489 (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, &caseFoldings[caseFoldingCount].simple, pErrorCode);
490 if(U_FAILURE(*pErrorCode)) {
491 fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
492 exit(*pErrorCode);
493 }
494
495 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
496 if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
497 caseFoldings[caseFoldingCount].simple=0;
498 }
499
500 /* update the case-sensitive set */
501 if(status!='T') {
502 uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
503 _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
504 }
505
506 /* check the status */
507 if(status=='S') {
508 /* check if there was a full mapping for this code point before */
509 if( caseFoldingCount>0 &&
510 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
511 caseFoldings[caseFoldingCount-1].status=='F'
512 ) {
513 /* merge the two entries */
514 caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
515 return;
516 }
517 } else if(status=='F') {
518 /* check if there was a simple mapping for this code point before */
519 if( caseFoldingCount>0 &&
520 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
521 caseFoldings[caseFoldingCount-1].status=='S'
522 ) {
523 /* merge the two entries */
524 uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
525 return;
526 }
527 } else if(status=='I' || status=='T') {
528 /* check if there was a default mapping for this code point before (remove it) */
529 while(caseFoldingCount>0 &&
530 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
531 ) {
532 prevCode=0;
533 --caseFoldingCount;
534 }
535 /* store only a marker for special handling for cases like dotless i */
536 caseFoldings[caseFoldingCount].simple=0;
537 caseFoldings[caseFoldingCount].full[0]=0;
538 }
539
540 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
541 if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
542 fprintf(stderr, "genprops: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
543 (unsigned long)caseFoldings[caseFoldingCount].code,
544 (unsigned long)prevCode);
545 *pErrorCode=U_PARSE_ERROR;
546 exit(U_PARSE_ERROR);
547 }
548 prevCode=caseFoldings[caseFoldingCount].code;
549
550 if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
551 fprintf(stderr, "genprops: too many case folding mappings\n");
552 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
553 exit(U_INDEX_OUTOFBOUNDS_ERROR);
554 }
555 }
556
557 static void
558 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
559 char *fields[3][2];
560
561 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
562 return;
563 }
564
565 u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
566 }
567
568 /* parser for UnicodeData.txt ----------------------------------------------- */
569
570 /* general categories */
571 const char *const
572 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
573 "Cn",
574 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
575 "Mc", "Nd", "Nl", "No",
576 "Zs", "Zl", "Zp",
577 "Cc", "Cf", "Co", "Cs",
578 "Pd", "Ps", "Pe", "Pc", "Po",
579 "Sm", "Sc", "Sk", "So",
580 "Pi", "Pf"
581 };
582
583 const char *const
584 bidiNames[U_CHAR_DIRECTION_COUNT]={
585 "L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
586 "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
587 };
588
589 const char *const
590 decompositionTypeNames[U_DT_COUNT]={
591 NULL,
592 NULL,
593 "compat",
594 "circle",
595 "final",
596 "font",
597 "fraction",
598 "initial",
599 "isolated",
600 "medial",
601 "narrow",
602 "noBreak",
603 "small",
604 "square",
605 "sub",
606 "super",
607 "vertical",
608 "wide"
609 };
610
611 static struct {
612 uint32_t first, last, props;
613 char name[80];
614 } unicodeAreas[32];
615
616 static int32_t unicodeAreaIndex=0, mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0;
617
618 static void U_CALLCONV
619 unicodeDataLineFn(void *context,
620 char *fields[][2], int32_t fieldCount,
621 UErrorCode *pErrorCode) {
622 Props p;
623 char *end;
624 static uint32_t prevCode=0;
625 uint32_t value;
626 int32_t i;
627
628 /* reset the properties */
629 uprv_memset(&p, 0, sizeof(Props));
630
631 /* get the character code, field 0 */
632 p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
633 if(end<=fields[0][0] || end!=fields[0][1]) {
634 fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
635 *pErrorCode=U_PARSE_ERROR;
636 exit(U_PARSE_ERROR);
637 }
638
639 /* get general category, field 2 */
640 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
641 if(i>=0) {
642 p.generalCategory=(uint8_t)i;
643 } else {
644 fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
645 fields[2][0], (unsigned long)p.code);
646 *pErrorCode=U_PARSE_ERROR;
647 exit(U_PARSE_ERROR);
648 }
649
650 /* get BiDi category, field 4 */
651 i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]);
652 if(i>=0) {
653 p.bidi=(uint8_t)i;
654 } else {
655 fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
656 fields[4][0], (unsigned long)p.code);
657 *pErrorCode=U_PARSE_ERROR;
658 exit(U_PARSE_ERROR);
659 }
660
661 /* get decomposition type, field 5 */
662 if(fields[5][0]<fields[5][1]) {
663 /* there is some decomposition */
664 if(*fields[5][0]!='<') {
665 /* canonical */
666 i=U_DT_CANONICAL;
667 } else {
668 /* get compatibility type */
669 end=fields[5][0]+1;
670 while(end<fields[5][1] && *end!='>') {
671 ++end;
672 }
673 *end='#';
674 i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
675 if(i<0) {
676 fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
677 fields[5][0], (unsigned long)p.code);
678 *pErrorCode=U_PARSE_ERROR;
679 exit(U_PARSE_ERROR);
680 }
681 }
682 if(!upvec_setValue(pv, p.code, p.code+1, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode)) {
683 fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
684 exit(*pErrorCode);
685 }
686 }
687
688 /* decimal digit value, field 6 */
689 if(fields[6][0]<fields[6][1]) {
690 value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
691 if(end!=fields[6][1] || value>0x7fff) {
692 fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
693 (unsigned long)p.code);
694 *pErrorCode=U_PARSE_ERROR;
695 exit(U_PARSE_ERROR);
696 }
697 p.numericValue=(int32_t)value;
698 p.numericType=1;
699 }
700
701 /* digit value, field 7 */
702 if(fields[7][0]<fields[7][1]) {
703 value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
704 if(end!=fields[7][1] || value>0x7fff) {
705 fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
706 (unsigned long)p.code);
707 *pErrorCode=U_PARSE_ERROR;
708 exit(U_PARSE_ERROR);
709 }
710 if(p.numericType==0) {
711 p.numericValue=(int32_t)value;
712 p.numericType=2;
713 } else if((int32_t)value!=p.numericValue) {
714 fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
715 (unsigned long)p.code);
716 *pErrorCode=U_PARSE_ERROR;
717 exit(U_PARSE_ERROR);
718 }
719 }
720
721 /* numeric value, field 8 */
722 if(fields[8][0]<fields[8][1]) {
723 char *s=fields[8][0];
724 UBool isNegative;
725
726 /* get a possible minus sign */
727 if(*s=='-') {
728 isNegative=TRUE;
729 ++s;
730 } else {
731 isNegative=FALSE;
732 }
733
734 value=(uint32_t)uprv_strtoul(s, &end, 10);
735 if(value>0 && *end=='/') {
736 /* field 8 may contain a fractional value, get the denominator */
737 if(p.numericType>0) {
738 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
739 (unsigned long)p.code);
740 *pErrorCode=U_PARSE_ERROR;
741 exit(U_PARSE_ERROR);
742 }
743
744 p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
745 if(p.denominator==0) {
746 fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
747 (unsigned long)p.code);
748 *pErrorCode=U_PARSE_ERROR;
749 exit(U_PARSE_ERROR);
750 }
751 }
752 if(end!=fields[8][1] || value>0x7fffffff) {
753 fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
754 (unsigned long)p.code);
755 *pErrorCode=U_PARSE_ERROR;
756 exit(U_PARSE_ERROR);
757 }
758
759 if(p.numericType==0) {
760 if(isNegative) {
761 p.numericValue=-(int32_t)value;
762 } else {
763 p.numericValue=(int32_t)value;
764 }
765 p.numericType=3;
766 } else if((int32_t)value!=p.numericValue) {
767 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
768 (unsigned long)p.code);
769 *pErrorCode=U_PARSE_ERROR;
770 exit(U_PARSE_ERROR);
771 }
772 }
773
774 /* get Mirrored flag, field 9 */
775 if(*fields[9][0]=='Y') {
776 p.isMirrored=1;
777 } else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
778 fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n",
779 (unsigned long)p.code);
780 *pErrorCode=U_PARSE_ERROR;
781 exit(U_PARSE_ERROR);
782 }
783
784 /* get uppercase mapping, field 12 */
785 value=(uint32_t)uprv_strtoul(fields[12][0], &end, 16);
786 if(end!=fields[12][1]) {
787 fprintf(stderr, "genprops: syntax error in field 12 at code 0x%lx\n",
788 (unsigned long)p.code);
789 *pErrorCode=U_PARSE_ERROR;
790 exit(U_PARSE_ERROR);
791 }
792 if(value!=0 && value!=p.code) {
793 p.upperCase=value;
794 uset_add(caseSensitive, (UChar32)p.code);
795 uset_add(caseSensitive, (UChar32)value);
796 }
797
798 /* get lowercase value, field 13 */
799 value=(uint32_t)uprv_strtoul(fields[13][0], &end, 16);
800 if(end!=fields[13][1]) {
801 fprintf(stderr, "genprops: syntax error in field 13 at code 0x%lx\n",
802 (unsigned long)p.code);
803 *pErrorCode=U_PARSE_ERROR;
804 exit(U_PARSE_ERROR);
805 }
806 if(value!=0 && value!=p.code) {
807 p.lowerCase=value;
808 uset_add(caseSensitive, (UChar32)p.code);
809 uset_add(caseSensitive, (UChar32)value);
810 }
811
812 /* get titlecase value, field 14 */
813 value=(uint32_t)uprv_strtoul(fields[14][0], &end, 16);
814 if(end!=fields[14][1]) {
815 fprintf(stderr, "genprops: syntax error in field 14 at code 0x%lx\n",
816 (unsigned long)p.code);
817 *pErrorCode=U_PARSE_ERROR;
818 exit(U_PARSE_ERROR);
819 }
820 if(value!=0 && value!=p.code) {
821 p.titleCase=value;
822 uset_add(caseSensitive, (UChar32)p.code);
823 uset_add(caseSensitive, (UChar32)value);
824 }
825
826 /* set additional properties from previously parsed files */
827 if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
828 p.mirrorMapping=mirrorMappings[mirrorIndex++][1];
829 }
830 if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
831 p.specialCasing=specialCasings+specialCasingIndex++;
832 } else {
833 p.specialCasing=NULL;
834 }
835 if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
836 p.caseFolding=caseFoldings+caseFoldingIndex++;
837
838 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
839 if( p.caseFolding->status=='C' &&
840 p.caseFolding->simple==p.lowerCase
841 ) {
842 p.caseFolding=NULL;
843 }
844 } else {
845 p.caseFolding=NULL;
846 }
847
848 value=makeProps(&p);
849
850 if(*fields[1][0]=='<') {
851 /* first or last entry of a Unicode area */
852 size_t length=fields[1][1]-fields[1][0];
853
854 if(length<9) {
855 /* name too short for an area name */
856 } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
857 /* set the current area */
858 if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
859 length-=9;
860 unicodeAreas[unicodeAreaIndex].first=p.code;
861 unicodeAreas[unicodeAreaIndex].props=value;
862 uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
863 unicodeAreas[unicodeAreaIndex].name[length]=0;
864 } else {
865 /* error: a previous area is incomplete */
866 fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
867 *pErrorCode=U_PARSE_ERROR;
868 exit(U_PARSE_ERROR);
869 }
870 return;
871 } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
872 /* check that the current area matches, and complete it with the last code point */
873 length-=8;
874 if( unicodeAreas[unicodeAreaIndex].props==value &&
875 0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
876 unicodeAreas[unicodeAreaIndex].name[length]==0 &&
877 unicodeAreas[unicodeAreaIndex].first<p.code
878 ) {
879 unicodeAreas[unicodeAreaIndex].last=p.code;
880 if(beVerbose) {
881 printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
882 (unsigned long)unicodeAreas[unicodeAreaIndex].first,
883 (unsigned long)unicodeAreas[unicodeAreaIndex].last,
884 unicodeAreas[unicodeAreaIndex].name);
885 }
886 unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
887 } else {
888 /* error: different properties between first & last, different area name, first>=last */
889 fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
890 *pErrorCode=U_PARSE_ERROR;
891 exit(U_PARSE_ERROR);
892 }
893 return;
894 } else {
895 /* not an area name */
896 }
897 }
898
899 /* check for non-character code points */
900 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
901 fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
902 (unsigned long)p.code);
903 *pErrorCode=U_PARSE_ERROR;
904 exit(U_PARSE_ERROR);
905 }
906
907 /* check that the code points (p.code) are in ascending order */
908 if(p.code<=prevCode && p.code>0) {
909 fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
910 (unsigned long)p.code, (unsigned long)prevCode);
911 *pErrorCode=U_PARSE_ERROR;
912 exit(U_PARSE_ERROR);
913 }
914 prevCode=p.code;
915
916 /* properties for a single code point */
917 addProps(p.code, value);
918 }
919
920 /* set repeated properties for the areas */
921 static void
922 repeatAreaProps() {
923 uint32_t puaProps;
924 int32_t i;
925 UBool hasPlane15PUA, hasPlane16PUA;
926 UErrorCode errorCode;
927
928 /*
929 * UnicodeData.txt before 3.0.1 did not contain the PUAs on
930 * planes 15 and 16.
931 * If that is the case, then we add them here, using the properties
932 * from the BMP PUA.
933 */
934 puaProps=0;
935 hasPlane15PUA=hasPlane16PUA=FALSE;
936
937 for(i=0; i<unicodeAreaIndex; ++i) {
938 repeatProps(unicodeAreas[i].first,
939 unicodeAreas[i].last,
940 unicodeAreas[i].props);
941 if(unicodeAreas[i].first==0xe000) {
942 puaProps=unicodeAreas[i].props;
943 } else if(unicodeAreas[i].first==0xf0000) {
944 hasPlane15PUA=TRUE;
945 } else if(unicodeAreas[i].first==0x100000) {
946 hasPlane16PUA=TRUE;
947 }
948 }
949
950 if(puaProps!=0) {
951 if(!hasPlane15PUA) {
952 repeatProps(0xf0000, 0xffffd, puaProps);
953 }
954 if(!hasPlane16PUA) {
955 repeatProps(0x100000, 0x10fffd, puaProps);
956 }
957 }
958
959 /* Hangul have canonical decompositions */
960 errorCode=U_ZERO_ERROR;
961 if(!upvec_setValue(pv, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode)) {
962 fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
963 exit(errorCode);
964 }
965 }
966
967 static void
968 parseDB(const char *filename, UErrorCode *pErrorCode) {
969 /* default Bidi classes for unassigned code points */
970 static const uint32_t defaultBidi[][2]={ /* { limit, class } */
971 { 0x0590, U_LEFT_TO_RIGHT },
972 { 0x0600, U_RIGHT_TO_LEFT },
973 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
974 { 0xFB1D, U_LEFT_TO_RIGHT },
975 { 0xFB50, U_RIGHT_TO_LEFT },
976 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
977 { 0xFE70, U_LEFT_TO_RIGHT },
978 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
979 { 0x110000, U_LEFT_TO_RIGHT }
980 };
981
982 char *fields[15][2];
983 UChar32 start, end;
984 uint32_t prev;
985 int32_t i;
986
987 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
988 return;
989 }
990
991 /*
992 * Set default Bidi classes for unassigned code points.
993 * See table 3-7 "Bidirectional Character Types" in UAX #9.
994 * http://www.unicode.org/reports/tr9/
995 */
996 prev=0;
997 for(i=0; i<LENGTHOF(defaultBidi); ++i) {
998 if(defaultBidi[i][1]!=0) {
999 repeatProps(prev, defaultBidi[i][0]-1, defaultBidi[i][1]<<UPROPS_BIDI_SHIFT);
1000 }
1001 prev=defaultBidi[i][0];
1002 }
1003
1004 /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
1005 unicodeAreas[0].first=0xffffffff;
1006
1007 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
1008
1009 if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
1010 fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
1011 unicodeAreas[unicodeAreaIndex].name,
1012 (unsigned long)unicodeAreas[unicodeAreaIndex].first);
1013 *pErrorCode=U_PARSE_ERROR;
1014 exit(U_PARSE_ERROR);
1015 }
1016
1017 repeatAreaProps();
1018
1019 /* are all sub-properties consumed? */
1020 if(mirrorIndex<mirrorCount) {
1021 fprintf(stderr, "genprops: error - some code points in BidiMirroring.txt are missing from UnicodeData.txt\n");
1022 *pErrorCode=U_PARSE_ERROR;
1023 exit(U_PARSE_ERROR);
1024 }
1025 if(specialCasingIndex<specialCasingCount) {
1026 fprintf(stderr, "genprops: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
1027 *pErrorCode=U_PARSE_ERROR;
1028 exit(U_PARSE_ERROR);
1029 }
1030 if(caseFoldingIndex<caseFoldingCount) {
1031 fprintf(stderr, "genprops: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
1032 *pErrorCode=U_PARSE_ERROR;
1033 exit(U_PARSE_ERROR);
1034 }
1035
1036 if(U_FAILURE(*pErrorCode)) {
1037 return;
1038 }
1039
1040 for(i=0;
1041 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
1042 ++i
1043 ) {
1044 addCaseSensitive(start, end);
1045 }
1046 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
1047 *pErrorCode=U_ZERO_ERROR;
1048 }
1049 }
1050
1051 /*
1052 * Hey, Emacs, please set the following:
1053 *
1054 * Local Variables:
1055 * indent-tabs-mode: nil
1056 * End:
1057 *
1058 */
1059