]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genprops/props2.c
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / tools / genprops / props2.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
374ca955 4* Copyright (C) 2002-2004, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: props2.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2002feb24
14* created by: Markus W. Scherer
15*
16* Parse more Unicode Character Database files and store
17* additional Unicode character properties in bit set vectors.
18*/
19
20#include <stdio.h>
21#include "unicode/utypes.h"
22#include "unicode/uchar.h"
23#include "unicode/uscript.h"
24#include "cstring.h"
25#include "cmemory.h"
26#include "utrie.h"
27#include "uprops.h"
28#include "propsvec.h"
29#include "uparse.h"
30#include "genprops.h"
31
/*
 * Number of elements of a true array (not of a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * primary expression wherever the macro is used (for example after sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
33
34/* data --------------------------------------------------------------------- */
35
36static UNewTrie *trie;
37uint32_t *pv;
38static int32_t pvCount;
39
40/* miscellaneous ------------------------------------------------------------ */
41
/*
 * Trim a field in place.
 * Leading whitespace is skipped with u_skipWhitespace(); trailing spaces and
 * tabs before limit are cut off by writing a NUL terminator over the first
 * of them (or at limit itself if there are none).
 *
 * Returns the pointer to the first character after the leading whitespace.
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);
    char *last=limit;

    /* walk back from the end while the preceding character is a space or tab */
    for(; first<last; --last) {
        char c=last[-1];
        if(c!=' ' && c!='\t') {
            break;
        }
    }
    *last=0;

    return first;
}
55
56static void
57parseTwoFieldFile(char *filename, char *basename,
58 const char *ucdFile, const char *suffix,
59 UParseLineFn *lineFn,
60 UErrorCode *pErrorCode) {
61 char *fields[2][2];
62
63 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
64 return;
65 }
66
67 writeUCDFilename(basename, ucdFile, suffix);
68
69 u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
70 if(U_FAILURE(*pErrorCode)) {
71 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
72 }
73}
74
75static void U_CALLCONV
76ageLineFn(void *context,
77 char *fields[][2], int32_t fieldCount,
78 UErrorCode *pErrorCode);
79
80static void
81parseMultiFieldFile(char *filename, char *basename,
82 const char *ucdFile, const char *suffix,
83 int32_t fieldCount,
84 UParseLineFn *lineFn,
85 UErrorCode *pErrorCode) {
86 char *fields[20][2];
87
88 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
89 return;
90 }
91
92 writeUCDFilename(basename, ucdFile, suffix);
93
94 u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
95 if(U_FAILURE(*pErrorCode)) {
96 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
97 }
98}
99
100static void U_CALLCONV
101numericLineFn(void *context,
102 char *fields[][2], int32_t fieldCount,
103 UErrorCode *pErrorCode);
104
105static void U_CALLCONV
106bidiClassLineFn(void *context,
107 char *fields[][2], int32_t fieldCount,
108 UErrorCode *pErrorCode);
109
110/* parse files with single enumerated properties ---------------------------- */
111
112struct SingleEnum {
113 const char *ucdFile, *propName;
114 UProperty prop;
115 int32_t vecWord, vecShift;
116 uint32_t vecMask;
117};
118typedef struct SingleEnum SingleEnum;
119
120static void
121parseSingleEnumFile(char *filename, char *basename, const char *suffix,
122 const SingleEnum *sen,
123 UErrorCode *pErrorCode);
124
125static const SingleEnum scriptSingleEnum={
126 "Scripts", "script",
127 UCHAR_SCRIPT,
128 0, 0, UPROPS_SCRIPT_MASK
129};
130
131static const SingleEnum blockSingleEnum={
132 "Blocks", "block",
133 UCHAR_BLOCK,
134 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
135};
136
137static const SingleEnum lineBreakSingleEnum={
138 "LineBreak", "line break",
139 UCHAR_LINE_BREAK,
140 0, UPROPS_LB_SHIFT, UPROPS_LB_MASK
141};
142
143static const SingleEnum eawSingleEnum={
144 "EastAsianWidth", "east asian width",
145 UCHAR_EAST_ASIAN_WIDTH,
146 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
147};
148
149static const SingleEnum jtSingleEnum={
150 "DerivedJoiningType", "joining type",
151 UCHAR_JOINING_TYPE,
152 2, UPROPS_JT_SHIFT, UPROPS_JT_MASK
153};
154
155static const SingleEnum jgSingleEnum={
156 "DerivedJoiningGroup", "joining group",
157 UCHAR_JOINING_GROUP,
158 2, UPROPS_JG_SHIFT, UPROPS_JG_MASK
159};
160
161static void U_CALLCONV
162singleEnumLineFn(void *context,
163 char *fields[][2], int32_t fieldCount,
164 UErrorCode *pErrorCode) {
165 const SingleEnum *sen;
166 char *s;
167 uint32_t start, limit, uv;
168 int32_t value;
169
170 sen=(const SingleEnum *)context;
171
172 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
173 if(U_FAILURE(*pErrorCode)) {
174 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
175 exit(*pErrorCode);
176 }
177 ++limit;
178
179 /* parse property alias */
180 s=trimTerminateField(fields[1][0], fields[1][1]);
181 value=u_getPropertyValueEnum(sen->prop, s);
182 if(value<0) {
183 if(sen->prop==UCHAR_BLOCK) {
184 if(isToken("Greek", s)) {
185 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
186 } else if(isToken("Combining Marks for Symbols", s)) {
187 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
188 } else if(isToken("Private Use", s)) {
189 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
190 }
191 }
192 }
193 if(value<0) {
194 fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
195 sen->propName, sen->ucdFile, s);
196 exit(U_PARSE_ERROR);
197 }
198
199 uv=(uint32_t)(value<<sen->vecShift);
200 if((uv&sen->vecMask)!=uv) {
201 fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
374ca955 202 sen->propName, (int)uv, s);
b75a7d8f
A
203 exit(U_INTERNAL_PROGRAM_ERROR);
204 }
205
206 if(!upvec_setValue(pv, start, limit, sen->vecWord, uv, sen->vecMask, pErrorCode)) {
207 fprintf(stderr, "genprops error: unable to set %s code: %s\n",
208 sen->propName, u_errorName(*pErrorCode));
209 exit(*pErrorCode);
210 }
211}
212
213static void
214parseSingleEnumFile(char *filename, char *basename, const char *suffix,
215 const SingleEnum *sen,
216 UErrorCode *pErrorCode) {
217 char *fields[2][2];
218
219 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
220 return;
221 }
222
223 writeUCDFilename(basename, sen->ucdFile, suffix);
224
225 u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
226 if(U_FAILURE(*pErrorCode)) {
227 fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
228 }
229}
230
231/* parse files with multiple binary properties ------------------------------ */
232
/* one binary property: its name in the UCD file and where its bit is stored */
typedef struct Binary {
    const char *propName;   /* name compared against field 1 with isToken() */
    int32_t vecWord;        /* column passed to upvec_setValue() */
    int32_t vecShift;       /* bit number; must be less than 32 */
} Binary;

/* the set of binary properties that is read from one UCD file */
typedef struct Binaries {
    const char *ucdFile;        /* UCD file name */
    const Binary *binaries;     /* array of recognized properties */
    int32_t binariesCount;      /* number of entries in binaries[] */
} Binaries;
245
246static const Binary
247propListNames[]={
248 { "White_Space", 1, UPROPS_WHITE_SPACE },
249 { "Bidi_Control", 1, UPROPS_BIDI_CONTROL },
250 { "Join_Control", 1, UPROPS_JOIN_CONTROL },
251 { "Dash", 1, UPROPS_DASH },
252 { "Hyphen", 1, UPROPS_HYPHEN },
253 { "Quotation_Mark", 1, UPROPS_QUOTATION_MARK },
254 { "Terminal_Punctuation", 1, UPROPS_TERMINAL_PUNCTUATION },
255 { "Hex_Digit", 1, UPROPS_HEX_DIGIT },
256 { "ASCII_Hex_Digit", 1, UPROPS_ASCII_HEX_DIGIT },
257 { "Ideographic", 1, UPROPS_IDEOGRAPHIC },
258 { "Diacritic", 1, UPROPS_DIACRITIC },
259 { "Extender", 1, UPROPS_EXTENDER },
260 { "Noncharacter_Code_Point", 1, UPROPS_NONCHARACTER_CODE_POINT },
261 { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK },
262 { "IDS_Binary_Operator", 1, UPROPS_IDS_BINARY_OPERATOR },
263 { "IDS_Trinary_Operator", 1, UPROPS_IDS_TRINARY_OPERATOR },
264 { "Radical", 1, UPROPS_RADICAL },
265 { "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
266 { "Deprecated", 1, UPROPS_DEPRECATED },
267 { "Soft_Dotted", 1, UPROPS_SOFT_DOTTED },
374ca955
A
268 { "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
269
270 /* new properties in Unicode 4.0.1 */
271 { "STerm", 2, UPROPS_V2_S_TERM },
272 { "Variation_Selector", 2, UPROPS_V2_VARIATION_SELECTOR }
b75a7d8f
A
273};
274
275static const Binaries
276propListBinaries={
277 "PropList", propListNames, LENGTHOF(propListNames)
278};
279
280static const Binary
281derCorePropsNames[]={
282 { "XID_Start", 1, UPROPS_XID_START },
283 { "XID_Continue", 1, UPROPS_XID_CONTINUE },
284
285 /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
286 { "Math", 1, UPROPS_MATH },
287 { "Alphabetic", 1, UPROPS_ALPHABETIC },
288 { "Lowercase", 1, UPROPS_LOWERCASE },
289 { "Uppercase", 1, UPROPS_UPPERCASE },
290 { "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND },
291 { "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
292
293 /* new properties bits in ICU 2.6/format version 3.2 */
294 { "ID_Start", 1, UPROPS_ID_START },
295 { "ID_Continue", 1, UPROPS_ID_CONTINUE },
296 { "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE }
297};
298
299static const Binaries
300derCorePropsBinaries={
301 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
302};
303
/* names of unrecognized properties seen in the current file: 100 rows of up to 63 characters plus NUL */
static char ignoredProps[100][64];

/* number of rows of ignoredProps[] in use; reset to 0 by parseBinariesFile() */
static int32_t ignoredPropsCount;
306
307static void
308addIgnoredProp(char *s, char *limit) {
309 int32_t i;
310
311 s=trimTerminateField(s, limit);
312 for(i=0; i<ignoredPropsCount; ++i) {
313 if(0==uprv_strcmp(ignoredProps[i], s)) {
314 return;
315 }
316 }
317 uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
318}
319
320static void U_CALLCONV
321binariesLineFn(void *context,
322 char *fields[][2], int32_t fieldCount,
323 UErrorCode *pErrorCode) {
324 const Binaries *bin;
325 char *s;
326 uint32_t start, limit, uv;
327 int32_t i;
328
329 bin=(const Binaries *)context;
330
331 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
332 if(U_FAILURE(*pErrorCode)) {
333 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
334 exit(*pErrorCode);
335 }
336 ++limit;
337
338 /* parse binary property name */
339 s=(char *)u_skipWhitespace(fields[1][0]);
340 for(i=0;; ++i) {
341 if(i==bin->binariesCount) {
342 /* ignore unrecognized properties */
343 addIgnoredProp(s, fields[1][1]);
344 return;
345 }
346 if(isToken(bin->binaries[i].propName, s)) {
347 break;
348 }
349 }
350
351 if(bin->binaries[i].vecShift>=32) {
352 fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
374ca955 353 (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
b75a7d8f
A
354 exit(U_INTERNAL_PROGRAM_ERROR);
355 }
356 uv=U_MASK(bin->binaries[i].vecShift);
357
358 if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, uv, uv, pErrorCode)) {
359 fprintf(stderr, "genprops error: unable to set %s code: %s\n",
360 bin->binaries[i].propName, u_errorName(*pErrorCode));
361 exit(*pErrorCode);
362 }
363}
364
365static void
366parseBinariesFile(char *filename, char *basename, const char *suffix,
367 const Binaries *bin,
368 UErrorCode *pErrorCode) {
369 char *fields[2][2];
370 int32_t i;
371
372 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
373 return;
374 }
375
376 writeUCDFilename(basename, bin->ucdFile, suffix);
377
378 ignoredPropsCount=0;
379
380 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
381 if(U_FAILURE(*pErrorCode)) {
382 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
383 }
384
385 for(i=0; i<ignoredPropsCount; ++i) {
386 printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
387 }
388}
389
390/* -------------------------------------------------------------------------- */
391
392U_CFUNC void
393initAdditionalProperties() {
394 pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
395}
396
397U_CFUNC void
398generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
399 char *basename;
400
401 basename=filename+uprv_strlen(filename);
402
403 /* process various UCD .txt files */
404
405 /* add Han numeric types & values */
374ca955 406 parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
b75a7d8f
A
407
408 /* set proper bidi class for unassigned code points (Cn) */
409 parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, pErrorCode);
410
411 parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
412
413 /*
414 * UTR 24 says:
415 * Section 2:
416 * "Common - For characters that may be used
417 * within multiple scripts,
418 * or any unassigned code points."
419 *
420 * Section 4:
421 * "The value COMMON is the default value,
422 * given to all code points that are not
423 * explicitly mentioned in the data file."
424 *
425 * COMMON==USCRIPT_COMMON==0 - nothing to do
426 */
427 parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
428
429 parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
430
431 parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
432
433 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);
434
435 /*
436 * LineBreak-4.0.0.txt:
437 * - All code points, assigned and unassigned, that are not listed
438 * explicitly are given the value "XX".
439 *
440 * XX==U_LB_UNKNOWN==0 - nothing to do
441 */
442 parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
443
444 parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, pErrorCode);
445
446 parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, pErrorCode);
447
448 /*
449 * Preset East Asian Width defaults:
450 *
451 * http://www.unicode.org/reports/tr11/#Unassigned
452 * 7.1 Unassigned and Private Use characters
453 *
454 * All unassigned characters are by default classified as non-East Asian neutral,
455 * except for the range U+20000 to U+2FFFD,
456 * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
457 * All Private use characters are by default classified as ambiguous,
458 * since their definition depends on context.
459 *
460 * N for all ==0 - nothing to do
461 * A for Private Use
462 * W for plane 2
463 */
464 *pErrorCode=U_ZERO_ERROR;
465 if( !upvec_setValue(pv, 0xe000, 0xf900, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
466 !upvec_setValue(pv, 0xf0000, 0xffffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
467 !upvec_setValue(pv, 0x100000, 0x10fffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
468 !upvec_setValue(pv, 0x20000, 0x2fffe, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)
469 ) {
470 fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
471 exit(*pErrorCode);
472 }
473
474 /* parse EastAsianWidth.txt */
475 parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
476
374ca955 477 trie=utrie_open(NULL, NULL, 50000, 0, 0, TRUE);
b75a7d8f
A
478 if(trie==NULL) {
479 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
480 upvec_close(pv);
481 return;
482 }
483
484 pvCount=upvec_toTrie(pv, trie, pErrorCode);
485 if(U_FAILURE(*pErrorCode)) {
486 fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
487 exit(*pErrorCode);
488 }
489}
490
491/* DerivedAge.txt ----------------------------------------------------------- */
492
493static void U_CALLCONV
494ageLineFn(void *context,
495 char *fields[][2], int32_t fieldCount,
496 UErrorCode *pErrorCode) {
497 char *s, *end;
498 uint32_t value, start, limit, version;
499
500 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
501 if(U_FAILURE(*pErrorCode)) {
502 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
503 exit(*pErrorCode);
504 }
505 ++limit;
506
507 /* parse version number */
508 s=(char *)u_skipWhitespace(fields[1][0]);
509 value=(uint32_t)uprv_strtoul(s, &end, 10);
510 if(s==end || value==0 || value>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) {
511 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
512 *pErrorCode=U_PARSE_ERROR;
513 exit(U_PARSE_ERROR);
514 }
515 version=value<<4;
516
517 /* parse minor version number */
518 if(*end=='.') {
519 s=(char *)u_skipWhitespace(end+1);
520 value=(uint32_t)uprv_strtoul(s, &end, 10);
521 if(s==end || value>15 || (*end!=' ' && *end!='\t' && *end!=0)) {
522 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
523 *pErrorCode=U_PARSE_ERROR;
524 exit(U_PARSE_ERROR);
525 }
526 version|=value;
527 }
528
529 if(!upvec_setValue(pv, start, limit, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
530 fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
531 exit(*pErrorCode);
532 }
533}
534
535/* DerivedNumericValues.txt ------------------------------------------------- */
536
537static void U_CALLCONV
538numericLineFn(void *context,
539 char *fields[][2], int32_t fieldCount,
540 UErrorCode *pErrorCode) {
541 Props newProps;
542 char *s, *end;
543 uint32_t start, limit, value, oldProps32;
374ca955 544 int32_t oldType;
b75a7d8f
A
545 char c;
546 UBool isFraction;
547
548 /* get the code point range */
549 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
550 if(U_FAILURE(*pErrorCode)) {
551 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
552 exit(*pErrorCode);
553 }
554 ++limit;
555
556 /* check if the numeric value is a fraction (this code does not handle any) */
557 isFraction=FALSE;
558 s=uprv_strchr(fields[1][0], '.');
559 if(s!=NULL) {
560 end=s+1;
561 while('0'<=(c=*end++) && c<='9') {
562 if(c!='0') {
563 isFraction=TRUE;
564 break;
565 }
566 }
567 }
568
569 if(isFraction) {
570 value=0;
571 } else {
572 /* parse numeric value */
573 s=(char *)u_skipWhitespace(fields[1][0]);
574
575 /* try large powers of 10 first, may otherwise overflow strtoul() */
576 if(0==uprv_strncmp(s, "10000000000", 11)) {
577 /* large powers of 10 are encoded in a special way, see store.c */
578 value=0x7fffff00;
579 end=s;
580 while(*(++end)=='0') {
581 ++value;
582 }
583 } else {
584 /* normal number parsing */
585 value=(uint32_t)uprv_strtoul(s, &end, 10);
586 }
587 if(end<=s || (*end!='.' && u_skipWhitespace(end)!=fields[1][1]) || value>=0x80000000) {
588 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
589 exit(U_PARSE_ERROR);
590 }
591 }
592
374ca955
A
593 /*
594 * Unicode 4.0.1 removes the third column that used to list the numeric type.
595 * Assume that either the data is the same as in UnicodeData.txt,
596 * or else that the numeric type is "numeric".
597 * This should work because we only expect to add numeric values for
598 * Han characters; for those, UnicodeData.txt lists only ranges without
599 * specific properties for single characters.
600 */
b75a7d8f
A
601
602 for(; start<limit; ++start) {
603 oldProps32=getProps(start);
604 oldType=(int32_t)GET_NUMERIC_TYPE(oldProps32);
374ca955 605 if(oldType!=0) {
b75a7d8f
A
606 /* this code point was already listed with its numeric value in UnicodeData.txt */
607 continue;
608 }
b75a7d8f
A
609
610 /*
611 * Do not set a numeric value for code points that have other
612 * values or exceptions because the code below is not prepared
613 * to maintain such values and exceptions.
614 *
615 * Check store.c (e.g., file format description and makeProps())
616 * for details of what code points get their value field interpreted.
617 * For example, case mappings for Ll/Lt/Lu and mirror mappings for mirrored characters.
618 *
619 * For simplicity, and because we only expect to set numeric values for Han characters,
620 * for now we only allow to set these values for Lo characters.
621 */
622 if(GET_UNSIGNED_VALUE(oldProps32)!=0 || PROPS_VALUE_IS_EXCEPTION(oldProps32) || GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
623 fprintf(stderr, "genprops error: new numeric value for a character with some other value in DerivedNumericValues.txt at %s\n", fields[0][0]);
624 exit(U_PARSE_ERROR);
625 }
626
627 if(isFraction) {
628 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
629 exit(U_PARSE_ERROR);
630 }
631
632 if(beVerbose) {
374ca955 633 printf("adding U+%04x numeric type %d value %u\n", (int)start, U_NT_NUMERIC, (int)value);
b75a7d8f
A
634 }
635
636 /* reconstruct the properties and set the new numeric type and value */
637 uprv_memset(&newProps, 0, sizeof(newProps));
638 newProps.code=start;
639 newProps.generalCategory=(uint8_t)GET_CATEGORY(oldProps32);
640 newProps.bidi=(uint8_t)GET_BIDI_CLASS(oldProps32);
641 newProps.isMirrored=(uint8_t)(oldProps32&(1UL<<UPROPS_MIRROR_SHIFT) ? TRUE : FALSE);
374ca955
A
642 newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
643 newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
b75a7d8f
A
644 addProps(start, makeProps(&newProps));
645 }
646}
647
648/* DerivedBidiClass.txt ----------------------------------------------------- */
649
650static void U_CALLCONV
651bidiClassLineFn(void *context,
652 char *fields[][2], int32_t fieldCount,
653 UErrorCode *pErrorCode) {
654 char *s;
655 uint32_t oldStart, start, limit, value, props32;
656 UBool didSet;
657
658 /* get the code point range */
659 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
660 if(U_FAILURE(*pErrorCode)) {
661 fprintf(stderr, "genprops: syntax error in DerivedBidiClass.txt field 0 at %s\n", fields[0][0]);
662 exit(*pErrorCode);
663 }
664 ++limit;
665
666 /* parse bidi class */
667 s=trimTerminateField(fields[1][0], fields[1][1]);
668 value=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, s);
374ca955 669 if((int32_t)value<0) {
b75a7d8f
A
670 fprintf(stderr, "genprops error: unknown bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
671 exit(U_PARSE_ERROR);
672 }
673
674 didSet=FALSE;
675 oldStart=start;
676 for(; start<limit; ++start) {
677 props32=getProps(start);
678
679 /* ignore if this bidi class is already set */
680 if(value==GET_BIDI_CLASS(props32)) {
681 continue;
682 }
683
684 /* ignore old bidi class, set only for unassigned code points (Cn) */
685 if(GET_CATEGORY(props32)!=0) {
686 /* error if this one contradicts what we parsed from UnicodeData.txt */
687 fprintf(stderr, "genprops error: different bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
688 exit(U_PARSE_ERROR);
689 }
690
691 /* remove whatever bidi class was set before */
692 props32&=~(0x1f<<UPROPS_BIDI_SHIFT);
693
694 /* set bidi class for Cn according to DerivedBidiClass.txt */
695 props32|=value<<UPROPS_BIDI_SHIFT;
696
697 /* set the modified properties */
698 addProps(start, props32);
699 didSet=TRUE;
700 }
701
702 if(didSet && beVerbose) {
374ca955 703 printf("setting U+%04x..U+%04x bidi class %d\n", (int)oldStart, (int)limit-1, (int)value);
b75a7d8f
A
704 }
705}
706
707/* data serialization ------------------------------------------------------- */
708
709U_CFUNC int32_t
710writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
711 int32_t length;
712 UErrorCode errorCode;
713
714 errorCode=U_ZERO_ERROR;
715 length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
716 if(U_FAILURE(errorCode)) {
717 fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
718 exit(errorCode);
719 }
720 if(p!=NULL) {
721 p+=length;
722 capacity-=length;
723 if(beVerbose) {
374ca955 724 printf("size in bytes of additional props trie:%5u\n", (int)length);
b75a7d8f
A
725 }
726
727 /* set indexes */
728 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
729 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
730 indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
731 indexes[UPROPS_RESERVED_INDEX]=
732 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
733
734 indexes[UPROPS_MAX_VALUES_INDEX]=
735 (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
736 (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
737 (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
738 ((int32_t)USCRIPT_CODE_LIMIT-1);
739 indexes[UPROPS_MAX_VALUES_2_INDEX]=
740 (((int32_t)U_JT_COUNT-1)<<UPROPS_JT_SHIFT)|
741 (((int32_t)U_JG_COUNT-1)<<UPROPS_JG_SHIFT)|
742 ((int32_t)U_DT_COUNT-1);
743 }
744
745 if(p!=NULL && (pvCount*4)<=capacity) {
746 uprv_memcpy(p, pv, pvCount*4);
747 if(beVerbose) {
374ca955 748 printf("number of additional props vectors: %5u\n", (int)pvCount/UPROPS_VECTOR_WORDS);
b75a7d8f
A
749 printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
750 }
751 }
752 length+=pvCount*4;
753
754 if(p!=NULL) {
755 utrie_close(trie);
756 upvec_close(pv);
757 }
758 return length;
759}