]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genprops/props2.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / tools / genprops / props2.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 2002-2003, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: props2.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2002feb24
14* created by: Markus W. Scherer
15*
16* Parse more Unicode Character Database files and store
17* additional Unicode character properties in bit set vectors.
18*/
19
20#include <stdio.h>
21#include "unicode/utypes.h"
22#include "unicode/uchar.h"
23#include "unicode/uscript.h"
24#include "cstring.h"
25#include "cmemory.h"
26#include "utrie.h"
27#include "uprops.h"
28#include "propsvec.h"
29#include "uparse.h"
30#include "genprops.h"
31
/*
 * Number of elements of a true array (not of a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro stays a single
 * operand next to postfix and unary operators (e.g. sizeof LENGTHOF(a)).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
33
34/* data --------------------------------------------------------------------- */
35
36static UNewTrie *trie;
37uint32_t *pv;
38static int32_t pvCount;
39
40/* miscellaneous ------------------------------------------------------------ */
41
/*
 * Trim a field in place.
 *
 * Skips leading whitespace with u_skipWhitespace(), backs up over trailing
 * spaces and tabs before limit, and writes a NUL terminator there.
 *
 * @param s     start of the field
 * @param limit one past the last character of the field; the byte at the
 *              trimmed end is overwritten with 0
 * @return pointer to the first non-whitespace character of the field
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);
    char *end=limit;

    /* walk backwards while the character just before end is a space or tab */
    for(; first<end; --end) {
        char last=end[-1];
        if(last!=' ' && last!='\t') {
            break;
        }
    }
    *end=0;

    return first;
}
55
56static void
57parseTwoFieldFile(char *filename, char *basename,
58 const char *ucdFile, const char *suffix,
59 UParseLineFn *lineFn,
60 UErrorCode *pErrorCode) {
61 char *fields[2][2];
62
63 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
64 return;
65 }
66
67 writeUCDFilename(basename, ucdFile, suffix);
68
69 u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
70 if(U_FAILURE(*pErrorCode)) {
71 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
72 }
73}
74
75static void U_CALLCONV
76ageLineFn(void *context,
77 char *fields[][2], int32_t fieldCount,
78 UErrorCode *pErrorCode);
79
80static void
81parseMultiFieldFile(char *filename, char *basename,
82 const char *ucdFile, const char *suffix,
83 int32_t fieldCount,
84 UParseLineFn *lineFn,
85 UErrorCode *pErrorCode) {
86 char *fields[20][2];
87
88 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
89 return;
90 }
91
92 writeUCDFilename(basename, ucdFile, suffix);
93
94 u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
95 if(U_FAILURE(*pErrorCode)) {
96 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
97 }
98}
99
100static void U_CALLCONV
101numericLineFn(void *context,
102 char *fields[][2], int32_t fieldCount,
103 UErrorCode *pErrorCode);
104
105static void U_CALLCONV
106bidiClassLineFn(void *context,
107 char *fields[][2], int32_t fieldCount,
108 UErrorCode *pErrorCode);
109
110/* parse files with single enumerated properties ---------------------------- */
111
112struct SingleEnum {
113 const char *ucdFile, *propName;
114 UProperty prop;
115 int32_t vecWord, vecShift;
116 uint32_t vecMask;
117};
118typedef struct SingleEnum SingleEnum;
119
120static void
121parseSingleEnumFile(char *filename, char *basename, const char *suffix,
122 const SingleEnum *sen,
123 UErrorCode *pErrorCode);
124
125static const SingleEnum scriptSingleEnum={
126 "Scripts", "script",
127 UCHAR_SCRIPT,
128 0, 0, UPROPS_SCRIPT_MASK
129};
130
131static const SingleEnum blockSingleEnum={
132 "Blocks", "block",
133 UCHAR_BLOCK,
134 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
135};
136
137static const SingleEnum lineBreakSingleEnum={
138 "LineBreak", "line break",
139 UCHAR_LINE_BREAK,
140 0, UPROPS_LB_SHIFT, UPROPS_LB_MASK
141};
142
143static const SingleEnum eawSingleEnum={
144 "EastAsianWidth", "east asian width",
145 UCHAR_EAST_ASIAN_WIDTH,
146 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
147};
148
149static const SingleEnum jtSingleEnum={
150 "DerivedJoiningType", "joining type",
151 UCHAR_JOINING_TYPE,
152 2, UPROPS_JT_SHIFT, UPROPS_JT_MASK
153};
154
155static const SingleEnum jgSingleEnum={
156 "DerivedJoiningGroup", "joining group",
157 UCHAR_JOINING_GROUP,
158 2, UPROPS_JG_SHIFT, UPROPS_JG_MASK
159};
160
161static void U_CALLCONV
162singleEnumLineFn(void *context,
163 char *fields[][2], int32_t fieldCount,
164 UErrorCode *pErrorCode) {
165 const SingleEnum *sen;
166 char *s;
167 uint32_t start, limit, uv;
168 int32_t value;
169
170 sen=(const SingleEnum *)context;
171
172 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
173 if(U_FAILURE(*pErrorCode)) {
174 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
175 exit(*pErrorCode);
176 }
177 ++limit;
178
179 /* parse property alias */
180 s=trimTerminateField(fields[1][0], fields[1][1]);
181 value=u_getPropertyValueEnum(sen->prop, s);
182 if(value<0) {
183 if(sen->prop==UCHAR_BLOCK) {
184 if(isToken("Greek", s)) {
185 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
186 } else if(isToken("Combining Marks for Symbols", s)) {
187 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
188 } else if(isToken("Private Use", s)) {
189 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
190 }
191 }
192 }
193 if(value<0) {
194 fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
195 sen->propName, sen->ucdFile, s);
196 exit(U_PARSE_ERROR);
197 }
198
199 uv=(uint32_t)(value<<sen->vecShift);
200 if((uv&sen->vecMask)!=uv) {
201 fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
202 sen->propName, uv, s);
203 exit(U_INTERNAL_PROGRAM_ERROR);
204 }
205
206 if(!upvec_setValue(pv, start, limit, sen->vecWord, uv, sen->vecMask, pErrorCode)) {
207 fprintf(stderr, "genprops error: unable to set %s code: %s\n",
208 sen->propName, u_errorName(*pErrorCode));
209 exit(*pErrorCode);
210 }
211}
212
213static void
214parseSingleEnumFile(char *filename, char *basename, const char *suffix,
215 const SingleEnum *sen,
216 UErrorCode *pErrorCode) {
217 char *fields[2][2];
218
219 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
220 return;
221 }
222
223 writeUCDFilename(basename, sen->ucdFile, suffix);
224
225 u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
226 if(U_FAILURE(*pErrorCode)) {
227 fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
228 }
229}
230
231/* parse files with multiple binary properties ------------------------------ */
232
/* one binary property: its name in the UCD file and its bit in the properties vectors */
typedef struct Binary {
    const char *propName;   /* property name as it appears in field 1 of the file */
    int32_t vecWord;        /* index of the vector word that holds the bit */
    int32_t vecShift;       /* bit number inside that word; must be below 32 */
} Binary;

/* one UCD file together with the list of binary properties taken from it */
typedef struct Binaries {
    const char *ucdFile;    /* UCD file name without ".txt" */
    const Binary *binaries; /* recognized properties */
    int32_t binariesCount;  /* number of entries in binaries[] */
} Binaries;
245
246static const Binary
247propListNames[]={
248 { "White_Space", 1, UPROPS_WHITE_SPACE },
249 { "Bidi_Control", 1, UPROPS_BIDI_CONTROL },
250 { "Join_Control", 1, UPROPS_JOIN_CONTROL },
251 { "Dash", 1, UPROPS_DASH },
252 { "Hyphen", 1, UPROPS_HYPHEN },
253 { "Quotation_Mark", 1, UPROPS_QUOTATION_MARK },
254 { "Terminal_Punctuation", 1, UPROPS_TERMINAL_PUNCTUATION },
255 { "Hex_Digit", 1, UPROPS_HEX_DIGIT },
256 { "ASCII_Hex_Digit", 1, UPROPS_ASCII_HEX_DIGIT },
257 { "Ideographic", 1, UPROPS_IDEOGRAPHIC },
258 { "Diacritic", 1, UPROPS_DIACRITIC },
259 { "Extender", 1, UPROPS_EXTENDER },
260 { "Noncharacter_Code_Point", 1, UPROPS_NONCHARACTER_CODE_POINT },
261 { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK },
262 { "IDS_Binary_Operator", 1, UPROPS_IDS_BINARY_OPERATOR },
263 { "IDS_Trinary_Operator", 1, UPROPS_IDS_TRINARY_OPERATOR },
264 { "Radical", 1, UPROPS_RADICAL },
265 { "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
266 { "Deprecated", 1, UPROPS_DEPRECATED },
267 { "Soft_Dotted", 1, UPROPS_SOFT_DOTTED },
268 { "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION }
269};
270
271static const Binaries
272propListBinaries={
273 "PropList", propListNames, LENGTHOF(propListNames)
274};
275
276static const Binary
277derCorePropsNames[]={
278 { "XID_Start", 1, UPROPS_XID_START },
279 { "XID_Continue", 1, UPROPS_XID_CONTINUE },
280
281 /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
282 { "Math", 1, UPROPS_MATH },
283 { "Alphabetic", 1, UPROPS_ALPHABETIC },
284 { "Lowercase", 1, UPROPS_LOWERCASE },
285 { "Uppercase", 1, UPROPS_UPPERCASE },
286 { "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND },
287 { "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
288
289 /* new properties bits in ICU 2.6/format version 3.2 */
290 { "ID_Start", 1, UPROPS_ID_START },
291 { "ID_Continue", 1, UPROPS_ID_CONTINUE },
292 { "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE }
293};
294
295static const Binaries
296derCorePropsBinaries={
297 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
298};
299
/* dimensions of the table of ignored property names */
enum {
    IGNORED_PROPS_CAPACITY=100,     /* maximum number of distinct names remembered */
    IGNORED_PROP_NAME_CAPACITY=64   /* bytes per name, including the terminating NUL */
};

/* distinct names of unrecognized properties seen in the file currently being parsed */
static char ignoredProps[IGNORED_PROPS_CAPACITY][IGNORED_PROP_NAME_CAPACITY];
static int32_t ignoredPropsCount;

/*
 * Remember the name of an unrecognized property so that it can be reported
 * once after the file has been parsed.
 *
 * The field [s, limit) is trimmed and NUL-terminated in place. A name that
 * would not fit into one table slot is shortened in place to the slot size,
 * and names beyond the table capacity are dropped, so that the table is
 * never written out of bounds. The table only feeds informational output.
 *
 * @param s     start of the property name field (modified in place)
 * @param limit one past the end of the field
 */
static void
addIgnoredProp(char *s, char *limit) {
    int32_t i;

    s=trimTerminateField(s, limit);

    /*
     * Clamp an over-long name before the lookup so that the comparison and
     * the copy below see the same, slot-sized string.
     */
    for(i=0; s[i]!=0; ++i) {
        if(i==IGNORED_PROP_NAME_CAPACITY-1) {
            s[i]=0;
            break;
        }
    }

    /* already recorded? */
    for(i=0; i<ignoredPropsCount; ++i) {
        if(0==uprv_strcmp(ignoredProps[i], s)) {
            return;
        }
    }

    /* table full: drop further names rather than overflow the array */
    if(ignoredPropsCount>=IGNORED_PROPS_CAPACITY) {
        return;
    }
    uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
}
315
316static void U_CALLCONV
317binariesLineFn(void *context,
318 char *fields[][2], int32_t fieldCount,
319 UErrorCode *pErrorCode) {
320 const Binaries *bin;
321 char *s;
322 uint32_t start, limit, uv;
323 int32_t i;
324
325 bin=(const Binaries *)context;
326
327 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
328 if(U_FAILURE(*pErrorCode)) {
329 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
330 exit(*pErrorCode);
331 }
332 ++limit;
333
334 /* parse binary property name */
335 s=(char *)u_skipWhitespace(fields[1][0]);
336 for(i=0;; ++i) {
337 if(i==bin->binariesCount) {
338 /* ignore unrecognized properties */
339 addIgnoredProp(s, fields[1][1]);
340 return;
341 }
342 if(isToken(bin->binaries[i].propName, s)) {
343 break;
344 }
345 }
346
347 if(bin->binaries[i].vecShift>=32) {
348 fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
349 bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
350 exit(U_INTERNAL_PROGRAM_ERROR);
351 }
352 uv=U_MASK(bin->binaries[i].vecShift);
353
354 if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, uv, uv, pErrorCode)) {
355 fprintf(stderr, "genprops error: unable to set %s code: %s\n",
356 bin->binaries[i].propName, u_errorName(*pErrorCode));
357 exit(*pErrorCode);
358 }
359}
360
361static void
362parseBinariesFile(char *filename, char *basename, const char *suffix,
363 const Binaries *bin,
364 UErrorCode *pErrorCode) {
365 char *fields[2][2];
366 int32_t i;
367
368 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
369 return;
370 }
371
372 writeUCDFilename(basename, bin->ucdFile, suffix);
373
374 ignoredPropsCount=0;
375
376 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
377 if(U_FAILURE(*pErrorCode)) {
378 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
379 }
380
381 for(i=0; i<ignoredPropsCount; ++i) {
382 printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
383 }
384}
385
386/* -------------------------------------------------------------------------- */
387
388U_CFUNC void
389initAdditionalProperties() {
390 pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
391}
392
393U_CFUNC void
394generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
395 char *basename;
396
397 basename=filename+uprv_strlen(filename);
398
399 /* process various UCD .txt files */
400
401 /* add Han numeric types & values */
402 parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 3, numericLineFn, pErrorCode);
403
404 /* set proper bidi class for unassigned code points (Cn) */
405 parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, pErrorCode);
406
407 parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
408
409 /*
410 * UTR 24 says:
411 * Section 2:
412 * "Common - For characters that may be used
413 * within multiple scripts,
414 * or any unassigned code points."
415 *
416 * Section 4:
417 * "The value COMMON is the default value,
418 * given to all code points that are not
419 * explicitly mentioned in the data file."
420 *
421 * COMMON==USCRIPT_COMMON==0 - nothing to do
422 */
423 parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
424
425 parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
426
427 parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
428
429 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);
430
431 /*
432 * LineBreak-4.0.0.txt:
433 * - All code points, assigned and unassigned, that are not listed
434 * explicitly are given the value "XX".
435 *
436 * XX==U_LB_UNKNOWN==0 - nothing to do
437 */
438 parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
439
440 parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, pErrorCode);
441
442 parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, pErrorCode);
443
444 /*
445 * Preset East Asian Width defaults:
446 *
447 * http://www.unicode.org/reports/tr11/#Unassigned
448 * 7.1 Unassigned and Private Use characters
449 *
450 * All unassigned characters are by default classified as non-East Asian neutral,
451 * except for the range U+20000 to U+2FFFD,
452 * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
453 * All Private use characters are by default classified as ambiguous,
454 * since their definition depends on context.
455 *
456 * N for all ==0 - nothing to do
457 * A for Private Use
458 * W for plane 2
459 */
460 *pErrorCode=U_ZERO_ERROR;
461 if( !upvec_setValue(pv, 0xe000, 0xf900, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
462 !upvec_setValue(pv, 0xf0000, 0xffffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
463 !upvec_setValue(pv, 0x100000, 0x10fffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
464 !upvec_setValue(pv, 0x20000, 0x2fffe, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)
465 ) {
466 fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
467 exit(*pErrorCode);
468 }
469
470 /* parse EastAsianWidth.txt */
471 parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
472
473 trie=utrie_open(NULL, NULL, 50000, 0, TRUE);
474 if(trie==NULL) {
475 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
476 upvec_close(pv);
477 return;
478 }
479
480 pvCount=upvec_toTrie(pv, trie, pErrorCode);
481 if(U_FAILURE(*pErrorCode)) {
482 fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
483 exit(*pErrorCode);
484 }
485}
486
487/* DerivedAge.txt ----------------------------------------------------------- */
488
489static void U_CALLCONV
490ageLineFn(void *context,
491 char *fields[][2], int32_t fieldCount,
492 UErrorCode *pErrorCode) {
493 char *s, *end;
494 uint32_t value, start, limit, version;
495
496 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
497 if(U_FAILURE(*pErrorCode)) {
498 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
499 exit(*pErrorCode);
500 }
501 ++limit;
502
503 /* parse version number */
504 s=(char *)u_skipWhitespace(fields[1][0]);
505 value=(uint32_t)uprv_strtoul(s, &end, 10);
506 if(s==end || value==0 || value>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) {
507 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
508 *pErrorCode=U_PARSE_ERROR;
509 exit(U_PARSE_ERROR);
510 }
511 version=value<<4;
512
513 /* parse minor version number */
514 if(*end=='.') {
515 s=(char *)u_skipWhitespace(end+1);
516 value=(uint32_t)uprv_strtoul(s, &end, 10);
517 if(s==end || value>15 || (*end!=' ' && *end!='\t' && *end!=0)) {
518 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
519 *pErrorCode=U_PARSE_ERROR;
520 exit(U_PARSE_ERROR);
521 }
522 version|=value;
523 }
524
525 if(!upvec_setValue(pv, start, limit, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
526 fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
527 exit(*pErrorCode);
528 }
529}
530
531/* DerivedNumericValues.txt ------------------------------------------------- */
532
533static void U_CALLCONV
534numericLineFn(void *context,
535 char *fields[][2], int32_t fieldCount,
536 UErrorCode *pErrorCode) {
537 Props newProps;
538 char *s, *end;
539 uint32_t start, limit, value, oldProps32;
540 int32_t type, oldType;
541 char c;
542 UBool isFraction;
543
544 /* get the code point range */
545 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
546 if(U_FAILURE(*pErrorCode)) {
547 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
548 exit(*pErrorCode);
549 }
550 ++limit;
551
552 /* check if the numeric value is a fraction (this code does not handle any) */
553 isFraction=FALSE;
554 s=uprv_strchr(fields[1][0], '.');
555 if(s!=NULL) {
556 end=s+1;
557 while('0'<=(c=*end++) && c<='9') {
558 if(c!='0') {
559 isFraction=TRUE;
560 break;
561 }
562 }
563 }
564
565 if(isFraction) {
566 value=0;
567 } else {
568 /* parse numeric value */
569 s=(char *)u_skipWhitespace(fields[1][0]);
570
571 /* try large powers of 10 first, may otherwise overflow strtoul() */
572 if(0==uprv_strncmp(s, "10000000000", 11)) {
573 /* large powers of 10 are encoded in a special way, see store.c */
574 value=0x7fffff00;
575 end=s;
576 while(*(++end)=='0') {
577 ++value;
578 }
579 } else {
580 /* normal number parsing */
581 value=(uint32_t)uprv_strtoul(s, &end, 10);
582 }
583 if(end<=s || (*end!='.' && u_skipWhitespace(end)!=fields[1][1]) || value>=0x80000000) {
584 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
585 exit(U_PARSE_ERROR);
586 }
587 }
588
589 /* parse numeric type */
590 s=trimTerminateField(fields[2][0], fields[2][1]);
591 type=u_getPropertyValueEnum(UCHAR_NUMERIC_TYPE, s);
592 if(type<=0) {
593 fprintf(stderr, "genprops error: unknown numeric type in DerivedNumericValues.txt field 1 at %s\n", s);
594 exit(U_PARSE_ERROR);
595 }
596
597 for(; start<limit; ++start) {
598 oldProps32=getProps(start);
599 oldType=(int32_t)GET_NUMERIC_TYPE(oldProps32);
600 if(oldType==type) {
601 /* this code point was already listed with its numeric value in UnicodeData.txt */
602 continue;
603 }
604 if(oldType!=0) {
605 /* the numeric type differs from what we got from UnicodeData.txt */
606 fprintf(stderr, "genprops error: new numeric value for an already numeric character in DerivedNumericValues.txt at %s\n", fields[0][0]);
607 exit(U_PARSE_ERROR);
608 }
609
610 /*
611 * Do not set a numeric value for code points that have other
612 * values or exceptions because the code below is not prepared
613 * to maintain such values and exceptions.
614 *
615 * Check store.c (e.g., file format description and makeProps())
616 * for details of what code points get their value field interpreted.
617 * For example, case mappings for Ll/Lt/Lu and mirror mappings for mirrored characters.
618 *
619 * For simplicity, and because we only expect to set numeric values for Han characters,
620 * for now we only allow to set these values for Lo characters.
621 */
622 if(GET_UNSIGNED_VALUE(oldProps32)!=0 || PROPS_VALUE_IS_EXCEPTION(oldProps32) || GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
623 fprintf(stderr, "genprops error: new numeric value for a character with some other value in DerivedNumericValues.txt at %s\n", fields[0][0]);
624 exit(U_PARSE_ERROR);
625 }
626
627 if(isFraction) {
628 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
629 exit(U_PARSE_ERROR);
630 }
631
632 if(beVerbose) {
633 printf("adding U+%04x numeric type %d value %u\n", start, type, value);
634 }
635
636 /* reconstruct the properties and set the new numeric type and value */
637 uprv_memset(&newProps, 0, sizeof(newProps));
638 newProps.code=start;
639 newProps.generalCategory=(uint8_t)GET_CATEGORY(oldProps32);
640 newProps.bidi=(uint8_t)GET_BIDI_CLASS(oldProps32);
641 newProps.isMirrored=(uint8_t)(oldProps32&(1UL<<UPROPS_MIRROR_SHIFT) ? TRUE : FALSE);
642 newProps.numericType=(uint8_t)type; /* newly parsed numeric type */
643 newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
644 addProps(start, makeProps(&newProps));
645 }
646}
647
648/* DerivedBidiClass.txt ----------------------------------------------------- */
649
650static void U_CALLCONV
651bidiClassLineFn(void *context,
652 char *fields[][2], int32_t fieldCount,
653 UErrorCode *pErrorCode) {
654 char *s;
655 uint32_t oldStart, start, limit, value, props32;
656 UBool didSet;
657
658 /* get the code point range */
659 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
660 if(U_FAILURE(*pErrorCode)) {
661 fprintf(stderr, "genprops: syntax error in DerivedBidiClass.txt field 0 at %s\n", fields[0][0]);
662 exit(*pErrorCode);
663 }
664 ++limit;
665
666 /* parse bidi class */
667 s=trimTerminateField(fields[1][0], fields[1][1]);
668 value=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, s);
669 if(value<0) {
670 fprintf(stderr, "genprops error: unknown bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
671 exit(U_PARSE_ERROR);
672 }
673
674 didSet=FALSE;
675 oldStart=start;
676 for(; start<limit; ++start) {
677 props32=getProps(start);
678
679 /* ignore if this bidi class is already set */
680 if(value==GET_BIDI_CLASS(props32)) {
681 continue;
682 }
683
684 /* ignore old bidi class, set only for unassigned code points (Cn) */
685 if(GET_CATEGORY(props32)!=0) {
686 /* error if this one contradicts what we parsed from UnicodeData.txt */
687 fprintf(stderr, "genprops error: different bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
688 exit(U_PARSE_ERROR);
689 }
690
691 /* remove whatever bidi class was set before */
692 props32&=~(0x1f<<UPROPS_BIDI_SHIFT);
693
694 /* set bidi class for Cn according to DerivedBidiClass.txt */
695 props32|=value<<UPROPS_BIDI_SHIFT;
696
697 /* set the modified properties */
698 addProps(start, props32);
699 didSet=TRUE;
700 }
701
702 if(didSet && beVerbose) {
703 printf("setting U+%04x..U+%04x bidi class %d\n", oldStart, limit-1, value);
704 }
705}
706
707/* data serialization ------------------------------------------------------- */
708
709U_CFUNC int32_t
710writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
711 int32_t length;
712 UErrorCode errorCode;
713
714 errorCode=U_ZERO_ERROR;
715 length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
716 if(U_FAILURE(errorCode)) {
717 fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
718 exit(errorCode);
719 }
720 if(p!=NULL) {
721 p+=length;
722 capacity-=length;
723 if(beVerbose) {
724 printf("size in bytes of additional props trie:%5u\n", length);
725 }
726
727 /* set indexes */
728 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
729 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
730 indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
731 indexes[UPROPS_RESERVED_INDEX]=
732 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
733
734 indexes[UPROPS_MAX_VALUES_INDEX]=
735 (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
736 (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
737 (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
738 ((int32_t)USCRIPT_CODE_LIMIT-1);
739 indexes[UPROPS_MAX_VALUES_2_INDEX]=
740 (((int32_t)U_JT_COUNT-1)<<UPROPS_JT_SHIFT)|
741 (((int32_t)U_JG_COUNT-1)<<UPROPS_JG_SHIFT)|
742 ((int32_t)U_DT_COUNT-1);
743 }
744
745 if(p!=NULL && (pvCount*4)<=capacity) {
746 uprv_memcpy(p, pv, pvCount*4);
747 if(beVerbose) {
748 printf("number of additional props vectors: %5u\n", pvCount/UPROPS_VECTOR_WORDS);
749 printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
750 }
751 }
752 length+=pvCount*4;
753
754 if(p!=NULL) {
755 utrie_close(trie);
756 upvec_close(pv);
757 }
758 return length;
759}