1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2011-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2011dec11
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/uchar.h"
30 PropertyNames::~PropertyNames() {}
33 PropertyNames::getPropertyEnum(const char *name
) const {
34 return u_getPropertyEnum(name
);
38 PropertyNames::getPropertyValueEnum(int32_t property
, const char *name
) const {
39 return u_getPropertyValueEnum((UProperty
)property
, name
);
43 : start(U_SENTINEL
), end(U_SENTINEL
),
44 bmg(U_SENTINEL
), bpb(U_SENTINEL
),
45 scf(U_SENTINEL
), slc(U_SENTINEL
), stc(U_SENTINEL
), suc(U_SENTINEL
),
46 digitValue(-1), numericValue(NULL
),
47 name(NULL
), nameAlias(NULL
) {
48 memset(binProps
, 0, sizeof(binProps
));
49 memset(intProps
, 0, sizeof(intProps
));
53 UniProps::~UniProps() {}
55 const int32_t PreparsedUCD::kNumLineBuffers
;
57 PreparsedUCD::PreparsedUCD(const char *filename
, UErrorCode
&errorCode
)
58 : icuPnames(new PropertyNames()), pnames(icuPnames
),
60 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
63 fieldLimit(NULL
), lineLimit(NULL
) {
64 if(U_FAILURE(errorCode
)) { return; }
66 if(filename
==NULL
|| *filename
==0 || (*filename
=='-' && filename
[1]==0)) {
70 file
=fopen(filename
, "r");
73 perror("error opening preparsed UCD");
74 fprintf(stderr
, "error opening preparsed UCD file %s\n", filename
? filename
: "\"no file name given\"");
75 errorCode
=U_FILE_ACCESS_ERROR
;
79 memset(ucdVersion
, 0, 4);
83 PreparsedUCD::~PreparsedUCD() {
90 // Same order as the LineType values.
91 static const char *lineTypeStrings
[]={
105 PreparsedUCD::LineType
106 PreparsedUCD::readLine(UErrorCode
&errorCode
) {
107 if(U_FAILURE(errorCode
)) { return NO_LINE
; }
108 // Select the next available line buffer.
109 while(!isLineBufferAvailable(lineIndex
)) {
111 if (lineIndex
== kNumLineBuffers
) {
115 char *line
=lines
[lineIndex
];
117 lineLimit
=fieldLimit
=line
;
119 char *result
=fgets(line
, sizeof(lines
[0]), file
);
122 perror("error reading preparsed UCD");
123 fprintf(stderr
, "error reading preparsed UCD before line %ld\n", (long)lineNumber
);
124 errorCode
=U_FILE_ACCESS_ERROR
;
130 fieldLimit
=strchr(line
, 0);
131 return lineType
=EMPTY_LINE
;
133 // Remove trailing /r/n.
135 char *limit
=strchr(line
, 0);
136 while(line
<limit
&& ((c
=*(limit
-1))=='\n' || c
=='\r')) { --limit
; }
137 // Remove trailing white space.
138 while(line
<limit
&& ((c
=*(limit
-1))==' ' || c
=='\t')) { --limit
; }
143 return lineType
=EMPTY_LINE
;
147 while((semi
=strchr(semi
, ';'))!=NULL
) { *semi
++=0; }
148 fieldLimit
=strchr(line
, 0);
149 // Determine the line type.
151 for(type
=EMPTY_LINE
+1;; ++type
) {
152 if(type
==LINE_TYPE_COUNT
) {
154 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
155 line
, (long)lineNumber
);
156 errorCode
=U_PARSE_ERROR
;
159 if(0==strcmp(line
, lineTypeStrings
[type
])) {
163 lineType
=(LineType
)type
;
164 if(lineType
==UNICODE_VERSION_LINE
&& fieldLimit
<lineLimit
) {
165 u_versionFromString(ucdVersion
, fieldLimit
+1);
171 PreparsedUCD::firstField() {
172 char *field
=lines
[lineIndex
];
173 fieldLimit
=strchr(field
, 0);
178 PreparsedUCD::nextField() {
179 if(fieldLimit
==lineLimit
) { return NULL
; }
180 char *field
=fieldLimit
+1;
181 fieldLimit
=strchr(field
, 0);
186 PreparsedUCD::getProps(UnicodeSet
&newValues
, UErrorCode
&errorCode
) {
187 if(U_FAILURE(errorCode
)) { return NULL
; }
189 if(!lineHasPropertyValues()) {
190 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
194 const char *field
=nextField();
196 // No range field after the type.
198 "error in preparsed UCD: missing default/block/cp range field "
199 "(no second field) on line %ld\n",
201 errorCode
=U_PARSE_ERROR
;
205 if(!parseCodePointRange(field
, start
, end
, errorCode
)) { return NULL
; }
207 UBool insideBlock
=FALSE
; // TRUE if cp or unassigned range inside the block range.
210 // Should occur before any block/cp/unassigned line.
211 if(blockLineIndex
>=0) {
213 "error in preparsed UCD: default line %ld after one or more block lines\n",
215 errorCode
=U_PARSE_ERROR
;
218 if(defaultLineIndex
>=0) {
220 "error in preparsed UCD: second line with default properties on line %ld\n",
222 errorCode
=U_PARSE_ERROR
;
225 if(start
!=0 || end
!=0x10ffff) {
227 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
228 field
, (long)lineNumber
);
229 errorCode
=U_PARSE_ERROR
;
233 defaultLineIndex
=lineIndex
;
236 blockProps
=defaultProps
; // Block inherits default properties.
238 blockLineIndex
=lineIndex
;
241 case UNASSIGNED_LINE
:
242 if(blockProps
.start
<=start
&& end
<=blockProps
.end
) {
244 if(lineType
==CP_LINE
) {
245 // Code point range fully inside the last block inherits the block properties.
248 // Unassigned line inside the block is based on default properties
249 // which override block properties.
250 cpProps
=defaultProps
;
251 newValues
=blockValues
;
252 // Except, it inherits the one blk=Block property.
253 int32_t blkIndex
=UCHAR_BLOCK
-UCHAR_INT_START
;
254 cpProps
.intProps
[blkIndex
]=blockProps
.intProps
[blkIndex
];
255 newValues
.remove((UChar32
)UCHAR_BLOCK
);
257 } else if(start
>blockProps
.end
|| end
<blockProps
.start
) {
258 // Code point range fully outside the last block inherits the default properties.
259 cpProps
=defaultProps
;
261 // Code point range partially overlapping with the last block is illegal.
263 "error in preparsed UCD: cp range %s on line %ld only "
264 "partially overlaps with block range %04lX..%04lX\n",
265 field
, (long)lineNumber
, (long)blockProps
.start
, (long)blockProps
.end
);
266 errorCode
=U_PARSE_ERROR
;
272 // Will not occur because of the range check above.
273 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
278 while((field
=nextField())!=NULL
) {
279 if(!parseProperty(*props
, field
, newValues
, errorCode
)) { return NULL
; }
281 if(lineType
==BLOCK_LINE
) {
282 blockValues
=newValues
;
283 } else if(lineType
==UNASSIGNED_LINE
&& insideBlock
) {
284 // Unset newValues for values that are the same as the block values.
285 for(int32_t prop
=0; prop
<UCHAR_BINARY_LIMIT
; ++prop
) {
286 if(newValues
.contains(prop
) && cpProps
.binProps
[prop
]==blockProps
.binProps
[prop
]) {
287 newValues
.remove(prop
);
290 for(int32_t prop
=UCHAR_INT_START
; prop
<UCHAR_INT_LIMIT
; ++prop
) {
291 int32_t index
=prop
-UCHAR_INT_START
;
292 if(newValues
.contains(prop
) && cpProps
.intProps
[index
]==blockProps
.intProps
[index
]) {
293 newValues
.remove(prop
);
300 static const struct {
303 } ppucdProperties
[]={
304 { "Name_Alias", PPUCD_NAME_ALIAS
},
305 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS
},
306 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING
}
309 // Returns TRUE for "ok to continue parsing fields".
311 PreparsedUCD::parseProperty(UniProps
&props
, const char *field
, UnicodeSet
&newValues
,
312 UErrorCode
&errorCode
) {
315 const char *v
=strchr(p
, '=');
320 "error in preparsed UCD: mix of binary-property-no and "
321 "enum-property syntax '%s' on line %ld\n",
322 field
, (long)lineNumber
);
323 errorCode
=U_PARSE_ERROR
;
332 // Copy out the property name rather than modifying the field (writing a NUL).
333 pBuffer
.append(p
, (int32_t)(v
-p
), errorCode
);
337 int32_t prop
=pnames
->getPropertyEnum(p
);
339 for(int32_t i
=0;; ++i
) {
340 if(i
==UPRV_LENGTHOF(ppucdProperties
)) {
341 // Ignore unknown property names.
344 if(0==uprv_stricmp(p
, ppucdProperties
[i
].name
)) {
345 prop
=ppucdProperties
[i
].prop
;
351 if(prop
<UCHAR_BINARY_LIMIT
) {
353 props
.binProps
[prop
]=(UBool
)binaryValue
;
355 // No binary value for a binary property.
357 "error in preparsed UCD: enum-property syntax '%s' "
358 "for binary property on line %ld\n",
359 field
, (long)lineNumber
);
360 errorCode
=U_PARSE_ERROR
;
362 } else if(binaryValue
>=0) {
363 // Binary value for a non-binary property.
365 "error in preparsed UCD: binary-property syntax '%s' "
366 "for non-binary property on line %ld\n",
367 field
, (long)lineNumber
);
368 errorCode
=U_PARSE_ERROR
;
369 } else if (prop
< UCHAR_INT_START
) {
371 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
372 prop
, (long)lineNumber
);
373 errorCode
=U_PARSE_ERROR
;
374 } else if(prop
<UCHAR_INT_LIMIT
) {
375 int32_t value
=pnames
->getPropertyValueEnum(prop
, v
);
376 if(value
==UCHAR_INVALID_CODE
&& prop
==UCHAR_CANONICAL_COMBINING_CLASS
) {
377 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
379 unsigned long ccc
=uprv_strtoul(v
, &end
, 10);
380 if(v
<end
&& *end
==0 && ccc
<=254) {
384 if(value
==UCHAR_INVALID_CODE
) {
386 "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
387 field
, (long)lineNumber
);
388 errorCode
=U_PARSE_ERROR
;
390 props
.intProps
[prop
-UCHAR_INT_START
]=value
;
393 // Do not parse default values like <code point>, just set null values.
395 case UCHAR_BIDI_MIRRORING_GLYPH
:
396 props
.bmg
=U_SENTINEL
;
398 case UCHAR_BIDI_PAIRED_BRACKET
:
399 props
.bpb
=U_SENTINEL
;
401 case UCHAR_SIMPLE_CASE_FOLDING
:
402 props
.scf
=U_SENTINEL
;
404 case UCHAR_SIMPLE_LOWERCASE_MAPPING
:
405 props
.slc
=U_SENTINEL
;
407 case UCHAR_SIMPLE_TITLECASE_MAPPING
:
408 props
.stc
=U_SENTINEL
;
410 case UCHAR_SIMPLE_UPPERCASE_MAPPING
:
411 props
.suc
=U_SENTINEL
;
413 case UCHAR_CASE_FOLDING
:
416 case UCHAR_LOWERCASE_MAPPING
:
419 case UCHAR_TITLECASE_MAPPING
:
422 case UCHAR_UPPERCASE_MAPPING
:
425 case UCHAR_SCRIPT_EXTENSIONS
:
430 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
431 field
, (long)lineNumber
);
432 errorCode
=U_PARSE_ERROR
;
437 case UCHAR_NUMERIC_VALUE
:
438 props
.numericValue
=v
;
440 if('0'<=c
&& c
<='9' && v
[1]==0) {
441 props
.digitValue
=c
-'0';
450 u_versionFromString(props
.age
, v
); // Writes 0.0.0.0 if v is not numeric.
452 case UCHAR_BIDI_MIRRORING_GLYPH
:
453 props
.bmg
=parseCodePoint(v
, errorCode
);
455 case UCHAR_BIDI_PAIRED_BRACKET
:
456 props
.bpb
=parseCodePoint(v
, errorCode
);
458 case UCHAR_SIMPLE_CASE_FOLDING
:
459 props
.scf
=parseCodePoint(v
, errorCode
);
461 case UCHAR_SIMPLE_LOWERCASE_MAPPING
:
462 props
.slc
=parseCodePoint(v
, errorCode
);
464 case UCHAR_SIMPLE_TITLECASE_MAPPING
:
465 props
.stc
=parseCodePoint(v
, errorCode
);
467 case UCHAR_SIMPLE_UPPERCASE_MAPPING
:
468 props
.suc
=parseCodePoint(v
, errorCode
);
470 case UCHAR_CASE_FOLDING
:
471 parseString(v
, props
.cf
, errorCode
);
473 case UCHAR_LOWERCASE_MAPPING
:
474 parseString(v
, props
.lc
, errorCode
);
476 case UCHAR_TITLECASE_MAPPING
:
477 parseString(v
, props
.tc
, errorCode
);
479 case UCHAR_UPPERCASE_MAPPING
:
480 parseString(v
, props
.uc
, errorCode
);
482 case PPUCD_NAME_ALIAS
:
485 case PPUCD_CONDITIONAL_CASE_MAPPINGS
:
486 case PPUCD_TURKIC_CASE_FOLDING
:
487 // No need to parse their values: They are hardcoded in the runtime library.
489 case UCHAR_SCRIPT_EXTENSIONS
:
490 parseScriptExtensions(v
, props
.scx
, errorCode
);
493 // Ignore unhandled properties.
497 if(U_SUCCESS(errorCode
)) {
498 newValues
.add((UChar32
)prop
);
506 PreparsedUCD::getRangeForAlgNames(UChar32
&start
, UChar32
&end
, UErrorCode
&errorCode
) {
507 if(U_FAILURE(errorCode
)) { return FALSE
; }
508 if(lineType
!=ALG_NAMES_RANGE_LINE
) {
509 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
513 const char *field
=nextField();
515 // No range field after the type.
517 "error in preparsed UCD: missing algnamesrange range field "
518 "(no second field) on line %ld\n",
520 errorCode
=U_PARSE_ERROR
;
523 return parseCodePointRange(field
, start
, end
, errorCode
);
527 PreparsedUCD::parseCodePoint(const char *s
, UErrorCode
&errorCode
) {
529 uint32_t value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
530 if(end
<=s
|| *end
!=0 || value
>=0x110000) {
532 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
533 s
, (long)lineNumber
);
534 errorCode
=U_PARSE_ERROR
;
537 return (UChar32
)value
;
541 PreparsedUCD::parseCodePointRange(const char *s
, UChar32
&start
, UChar32
&end
, UErrorCode
&errorCode
) {
543 u_parseCodePointRange(s
, &st
, &e
, &errorCode
);
544 if(U_FAILURE(errorCode
)) {
546 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
547 s
, (long)lineNumber
);
556 PreparsedUCD::parseString(const char *s
, UnicodeString
&uni
, UErrorCode
&errorCode
) {
557 UChar
*buffer
=toUCharPtr(uni
.getBuffer(-1));
558 int32_t length
=u_parseString(s
, buffer
, uni
.getCapacity(), NULL
, &errorCode
);
559 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
560 errorCode
=U_ZERO_ERROR
;
561 uni
.releaseBuffer(0);
562 buffer
=toUCharPtr(uni
.getBuffer(length
));
563 length
=u_parseString(s
, buffer
, uni
.getCapacity(), NULL
, &errorCode
);
565 uni
.releaseBuffer(length
);
566 if(U_FAILURE(errorCode
)) {
568 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
569 s
, (long)lineNumber
);
574 PreparsedUCD::parseScriptExtensions(const char *s
, UnicodeSet
&scx
, UErrorCode
&errorCode
) {
575 if(U_FAILURE(errorCode
)) { return; }
580 const char *scLimit
=strchr(s
, ' ');
582 scs
=scString
.clear().append(s
, (int32_t)(scLimit
-s
), errorCode
).data();
583 if(U_FAILURE(errorCode
)) { return; }
587 int32_t script
=pnames
->getPropertyValueEnum(UCHAR_SCRIPT
, scs
);
588 if(script
==UCHAR_INVALID_CODE
) {
590 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
591 scs
, (long)lineNumber
);
592 errorCode
=U_PARSE_ERROR
;
594 } else if(scx
.contains(script
)) {
596 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
597 scs
, (long)lineNumber
);
598 errorCode
=U_PARSE_ERROR
;
610 fprintf(stderr
, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber
);
611 errorCode
=U_PARSE_ERROR
;