2 *******************************************************************************
3 * Copyright (C) 2011-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * tab size: 8 (not used)
11 * created on: 2011dec11
12 * created by: Markus W. Scherer
15 #include "unicode/utypes.h"
16 #include "unicode/uchar.h"
// Out-of-line definition of the PropertyNames destructor; it has nothing to release.
28 PropertyNames::~PropertyNames() {}
// Looks up the property enum for a property name by forwarding to u_getPropertyEnum().
// NOTE(review): the return-type line and the closing brace are not part of this extract.
31 PropertyNames::getPropertyEnum(const char *name
) const {
32 return u_getPropertyEnum(name
);
// Looks up the value enum for a value name of the given property by forwarding to
// u_getPropertyValueEnum(); the int32_t property is cast to UProperty for that call.
// NOTE(review): the return-type line and the closing brace are not part of this extract.
36 PropertyNames::getPropertyValueEnum(int32_t property
, const char *name
) const {
37 return u_getPropertyValueEnum((UProperty
)property
, name
);
// Member-initializer list and body of a constructor (presumably UniProps::UniProps();
// the header line is not part of this extract -- TODO confirm).
// start, end, bmg, bpb, scf, slc, stc and suc begin as U_SENTINEL, digitValue as -1,
// numericValue, name and nameAlias as NULL; binProps and intProps are zero-filled.
41 : start(U_SENTINEL
), end(U_SENTINEL
),
42 bmg(U_SENTINEL
), bpb(U_SENTINEL
),
43 scf(U_SENTINEL
), slc(U_SENTINEL
), stc(U_SENTINEL
), suc(U_SENTINEL
),
44 digitValue(-1), numericValue(NULL
),
45 name(NULL
), nameAlias(NULL
) {
// sizeof() on the arrays themselves clears every element regardless of their length.
46 memset(binProps
, 0, sizeof(binProps
));
47 memset(intProps
, 0, sizeof(intProps
));
// Out-of-line definition of the UniProps destructor; the body is empty.
51 UniProps::~UniProps() {}
// Namespace-scope definition of the static member constant kNumLineBuffers.
// No initializer is given here (presumably the value is supplied in the class declaration -- confirm).
53 const int32_t PreparsedUCD::kNumLineBuffers
;
// Constructs a reader for a preparsed UCD file.
//   filename:  path passed to fopen(); NULL, "" and "-" are special-cased.
//   errorCode: in/out error code; nothing is opened if it already indicates failure,
//              and U_FILE_ACCESS_ERROR is set on the error-reporting path below.
// A default PropertyNames object is allocated as icuPnames and installed as pnames.
// NOTE(review): several original lines (branch bodies, the fopen() failure test, closing
// braces) are not part of this extract; the comments cover only the visible statements.
55 PreparsedUCD::PreparsedUCD(const char *filename
, UErrorCode
&errorCode
)
56 : icuPnames(new PropertyNames()), pnames(icuPnames
),
58 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
61 fieldLimit(NULL
), lineLimit(NULL
) {
62 if(U_FAILURE(errorCode
)) { return; }
// True for a NULL filename, an empty string, or exactly "-" (the branch body is not visible here).
64 if(filename
==NULL
|| *filename
==0 || (*filename
=='-' && filename
[1]==0)) {
// Open the named file in text read mode.
68 file
=fopen(filename
, "r");
// Error reporting: perror() plus a message naming the file, then flag the failure.
// Presumably reached only when fopen() failed; the guarding condition is not visible.
71 perror("error opening preparsed UCD");
72 fprintf(stderr
, "error opening preparsed UCD file %s\n", filename
? filename
: "\"no file name given\"");
73 errorCode
=U_FILE_ACCESS_ERROR
;
// Zero the first 4 bytes of ucdVersion.
77 memset(ucdVersion
, 0, 4);
// Destructor of PreparsedUCD.
// NOTE(review): the body is not part of this extract. The constructor allocates icuPnames
// with new and may fopen() a file, so verify that both are released here.
81 PreparsedUCD::~PreparsedUCD() {
// Table of line-type keywords, indexed by LineType value; readLine() compares the first
// field of each line against lineTypeStrings[type].
// NOTE(review): the array contents are not part of this extract.
88 // Same order as the LineType values.
89 static const char *lineTypeStrings
[]={
// Reads the next line of the preparsed UCD file into a free line buffer, strips the line
// ending and trailing blanks, splits it into NUL-terminated fields at each ';', and
// classifies it by matching the first field against lineTypeStrings.
//   errorCode: in/out error code. NO_LINE is returned immediately if it already indicates
//              failure; U_FILE_ACCESS_ERROR is set on the read-error path and U_PARSE_ERROR
//              when the first field matches no known line type.
// Returns the LineType, which is also stored in the lineType member.
// NOTE(review): a number of original lines (loop bodies, conditions, returns, closing braces)
// are not part of this extract; the comments cover only the visible statements.
102 PreparsedUCD::LineType
103 PreparsedUCD::readLine(UErrorCode
&errorCode
) {
104 if(U_FAILURE(errorCode
)) { return NO_LINE
; }
105 // Select the next available line buffer.
106 while(!isLineBufferAvailable(lineIndex
)) {
108 if (lineIndex
== kNumLineBuffers
) {
112 char *line
=lines
[lineIndex
];
// Start with an empty line: both limits point at the beginning of the buffer.
114 lineLimit
=fieldLimit
=line
;
// Read at most one buffer's worth of characters (sizeof one element of lines).
116 char *result
=fgets(line
, sizeof(lines
[0]), file
);
// Read-error reporting (the condition that selects this path is not visible here).
119 perror("error reading preparsed UCD");
120 fprintf(stderr
, "error reading preparsed UCD before line %ld\n", (long)lineNumber
);
121 errorCode
=U_FILE_ACCESS_ERROR
;
127 fieldLimit
=strchr(line
, 0);
128 return lineType
=EMPTY_LINE
;
130 // Remove trailing \r\n.
132 char *limit
=strchr(line
, 0);
133 while(line
<limit
&& ((c
=*(limit
-1))=='\n' || c
=='\r')) { --limit
; }
134 // Remove trailing white space.
135 while(line
<limit
&& ((c
=*(limit
-1))==' ' || c
=='\t')) { --limit
; }
140 return lineType
=EMPTY_LINE
;
// Split into fields: overwrite every ';' with a NUL terminator.
144 while((semi
=strchr(semi
, ';'))!=NULL
) { *semi
++=0; }
// fieldLimit now marks the end of the first field.
145 fieldLimit
=strchr(line
, 0);
146 // Determine the line type.
148 for(type
=EMPTY_LINE
+1;; ++type
) {
// Ran past the last known type without a match: unknown keyword.
149 if(type
==LINE_TYPE_COUNT
) {
151 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
152 line
, (long)lineNumber
);
153 errorCode
=U_PARSE_ERROR
;
156 if(0==strcmp(line
, lineTypeStrings
[type
])) {
160 lineType
=(LineType
)type
;
// A Unicode-version line that has a further field: parse that field into ucdVersion.
161 if(lineType
==UNICODE_VERSION_LINE
&& fieldLimit
<lineLimit
) {
162 u_versionFromString(ucdVersion
, fieldLimit
+1);
// Restarts field iteration on the current line: takes the start of lines[lineIndex] as the
// field and moves fieldLimit to that field's NUL terminator.
// NOTE(review): the return type, the return statement and the closing brace are not part of
// this extract.
168 PreparsedUCD::firstField() {
169 char *field
=lines
[lineIndex
];
170 fieldLimit
=strchr(field
, 0);
// Advances to the next ';'-separated field of the current line.
// Returns NULL when fieldLimit has reached lineLimit, i.e. no fields remain. Otherwise the
// next field starts one past the previous field's NUL, and fieldLimit moves to its NUL.
// NOTE(review): the return type, the final return statement and the closing brace are not
// part of this extract.
175 PreparsedUCD::nextField() {
176 if(fieldLimit
==lineLimit
) { return NULL
; }
177 char *field
=fieldLimit
+1;
178 fieldLimit
=strchr(field
, 0);
// Parses the property values on the current defaults/block/cp line.
//   newValues: passed through to parseProperty() for every remaining field.
//   errorCode: in/out error code. NULL is returned immediately if it already indicates
//              failure. U_ILLEGAL_ARGUMENT_ERROR is set when the current line carries no
//              property values; U_PARSE_ERROR is set for a missing or invalid range field,
//              a second defaults line, a defaults range other than 0..10FFFF, or a cp range
//              that only partially overlaps the last block.
// NOTE(review): the return type, the dispatch on the line type, the fprintf() openers, the
// success return and the closing braces are not part of this extract; the comments cover
// only the visible statements.
183 PreparsedUCD::getProps(UnicodeSet
&newValues
, UErrorCode
&errorCode
) {
184 if(U_FAILURE(errorCode
)) { return NULL
; }
186 if(!lineHasPropertyValues()) {
187 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// The second field holds the code point range.
191 const char *field
=nextField();
193 // No range field after the type.
195 "error in preparsed UCD: missing default/block/cp range field "
196 "(no second field) on line %ld\n",
198 errorCode
=U_PARSE_ERROR
;
202 if(!parseCodePointRange(field
, start
, end
, errorCode
)) { return NULL
; }
// Defaults line handling: only one such line is allowed, and it must cover 0..10FFFF.
206 if(defaultLineIndex
>=0) {
208 "error in preparsed UCD: second line with default properties on line %ld\n",
210 errorCode
=U_PARSE_ERROR
;
213 if(start
!=0 || end
!=0x10ffff) {
215 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
216 field
, (long)lineNumber
);
217 errorCode
=U_PARSE_ERROR
;
// Remember which line buffer holds the defaults line.
221 defaultLineIndex
=lineIndex
;
224 blockProps
=defaultProps
; // Block inherits default properties.
// Remember which line buffer holds the most recent block line.
226 blockLineIndex
=lineIndex
;
229 if(blockProps
.start
<=start
&& end
<=blockProps
.end
) {
230 // Code point range fully inside the last block inherits the block properties.
232 } else if(start
>blockProps
.end
|| end
<blockProps
.start
) {
233 // Code point range fully outside the last block inherits the default properties.
234 cpProps
=defaultProps
;
236 // Code point range partially overlapping with the last block is illegal.
238 "error in preparsed UCD: cp range %s on line %ld only "
239 "partially overlaps with block range %04lX..%04lX\n",
240 field
, (long)lineNumber
, (long)blockProps
.start
, (long)blockProps
.end
);
241 errorCode
=U_PARSE_ERROR
;
247 // Will not occur because of the range check above.
248 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// Parse every remaining field as one property; stop at the first failure.
253 while((field
=nextField())!=NULL
) {
254 if(!parseProperty(*props
, field
, newValues
, errorCode
)) { return NULL
; }
// Name-to-enum table for the three ppucd-specific property names shown below.
// parseProperty() searches it with a case-insensitive compare (uprv_stricmp) and reads
// the .name and .prop members.
// NOTE(review): the member declarations and the closing "};" are not part of this extract.
259 static const struct {
262 } ppucdProperties
[]={
263 { "Name_Alias", PPUCD_NAME_ALIAS
},
264 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS
},
265 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING
}
// Parses one property field of a defaults/block/cp line and stores the result in props.
//   props:     receives the parsed value (binProps, intProps, or a dedicated member).
//   field:     the field text; the visible code looks for '=' to split name from value.
//   newValues: the property is added to this set when errorCode still indicates success.
//   errorCode: in/out error code; U_PARSE_ERROR is set on each error-reporting path below.
// NOTE(review): the return type, the switch/if scaffolding, break/return statements, the
// fprintf() openers and closing braces are not part of this extract; the comments cover
// only the visible statements.
268 // Returns TRUE for "ok to continue parsing fields".
270 PreparsedUCD::parseProperty(UniProps
&props
, const char *field
, UnicodeSet
&newValues
,
271 UErrorCode
&errorCode
) {
// Locate the '=' that separates a property name from its value, if there is one.
274 const char *v
=strchr(p
, '=');
279 "error in preparsed UCD: mix of binary-property-no and "
280 "enum-property syntax '%s' on line %ld\n",
281 field
, (long)lineNumber
);
282 errorCode
=U_PARSE_ERROR
;
291 // Copy out the property name rather than modifying the field (writing a NUL).
292 pBuffer
.append(p
, (int32_t)(v
-p
), errorCode
);
// Resolve the property name through the PropertyNames provider.
296 int32_t prop
=pnames
->getPropertyEnum(p
);
// Search the ppucd-specific names (presumably only when the lookup above failed; the
// guarding condition is not visible here).
298 for(int32_t i
=0;; ++i
) {
299 if(i
==UPRV_LENGTHOF(ppucdProperties
)) {
300 // Ignore unknown property names.
303 if(0==uprv_stricmp(p
, ppucdProperties
[i
].name
)) {
304 prop
=ppucdProperties
[i
].prop
;
// Binary properties: store the flag directly, indexed by the property enum.
310 if(prop
<UCHAR_BINARY_LIMIT
) {
312 props
.binProps
[prop
]=(UBool
)binaryValue
;
314 // No binary value for a binary property.
316 "error in preparsed UCD: enum-property syntax '%s' "
317 "for binary property on line %ld\n",
318 field
, (long)lineNumber
);
319 errorCode
=U_PARSE_ERROR
;
321 } else if(binaryValue
>=0) {
322 // Binary value for a non-binary property.
324 "error in preparsed UCD: binary-property syntax '%s' "
325 "for non-binary property on line %ld\n",
326 field
, (long)lineNumber
);
327 errorCode
=U_PARSE_ERROR
;
// Enum values at or above UCHAR_BINARY_LIMIT but below UCHAR_INT_START are rejected.
328 } else if (prop
< UCHAR_INT_START
) {
330 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
331 prop
, (long)lineNumber
);
332 errorCode
=U_PARSE_ERROR
;
// Integer/enumerated properties: map the value name to its enum constant.
333 } else if(prop
<UCHAR_INT_LIMIT
) {
334 int32_t value
=pnames
->getPropertyValueEnum(prop
, v
);
// Fallback for ccc: accept a plain decimal number in the range 0..254.
335 if(value
==UCHAR_INVALID_CODE
&& prop
==UCHAR_CANONICAL_COMBINING_CLASS
) {
336 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
338 unsigned long ccc
=uprv_strtoul(v
, &end
, 10);
339 if(v
<end
&& *end
==0 && ccc
<=254) {
343 if(value
==UCHAR_INVALID_CODE
) {
345 "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
346 field
, (long)lineNumber
);
347 errorCode
=U_PARSE_ERROR
;
// intProps is indexed relative to UCHAR_INT_START.
349 props
.intProps
[prop
-UCHAR_INT_START
]=value
;
352 // Do not parse default values like <code point>, just set null values.
354 case UCHAR_BIDI_MIRRORING_GLYPH
:
355 props
.bmg
=U_SENTINEL
;
357 case UCHAR_BIDI_PAIRED_BRACKET
:
358 props
.bpb
=U_SENTINEL
;
360 case UCHAR_SIMPLE_CASE_FOLDING
:
361 props
.scf
=U_SENTINEL
;
363 case UCHAR_SIMPLE_LOWERCASE_MAPPING
:
364 props
.slc
=U_SENTINEL
;
366 case UCHAR_SIMPLE_TITLECASE_MAPPING
:
367 props
.stc
=U_SENTINEL
;
369 case UCHAR_SIMPLE_UPPERCASE_MAPPING
:
370 props
.suc
=U_SENTINEL
;
372 case UCHAR_CASE_FOLDING
:
375 case UCHAR_LOWERCASE_MAPPING
:
378 case UCHAR_TITLECASE_MAPPING
:
381 case UCHAR_UPPERCASE_MAPPING
:
384 case UCHAR_SCRIPT_EXTENSIONS
:
389 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
390 field
, (long)lineNumber
);
391 errorCode
=U_PARSE_ERROR
;
// Explicit (non-default) values follow: each case parses v for one property.
396 case UCHAR_NUMERIC_VALUE
:
// The numeric value is kept as the raw text pointer; it is not converted here.
397 props
.numericValue
=v
;
// A value consisting of exactly one ASCII digit also sets digitValue.
399 if('0'<=c
&& c
<='9' && v
[1]==0) {
400 props
.digitValue
=c
-'0';
409 u_versionFromString(props
.age
, v
); // Writes 0.0.0.0 if v is not numeric.
411 case UCHAR_BIDI_MIRRORING_GLYPH
:
412 props
.bmg
=parseCodePoint(v
, errorCode
);
414 case UCHAR_BIDI_PAIRED_BRACKET
:
415 props
.bpb
=parseCodePoint(v
, errorCode
);
417 case UCHAR_SIMPLE_CASE_FOLDING
:
418 props
.scf
=parseCodePoint(v
, errorCode
);
420 case UCHAR_SIMPLE_LOWERCASE_MAPPING
:
421 props
.slc
=parseCodePoint(v
, errorCode
);
423 case UCHAR_SIMPLE_TITLECASE_MAPPING
:
424 props
.stc
=parseCodePoint(v
, errorCode
);
426 case UCHAR_SIMPLE_UPPERCASE_MAPPING
:
427 props
.suc
=parseCodePoint(v
, errorCode
);
429 case UCHAR_CASE_FOLDING
:
430 parseString(v
, props
.cf
, errorCode
);
432 case UCHAR_LOWERCASE_MAPPING
:
433 parseString(v
, props
.lc
, errorCode
);
435 case UCHAR_TITLECASE_MAPPING
:
436 parseString(v
, props
.tc
, errorCode
);
438 case UCHAR_UPPERCASE_MAPPING
:
439 parseString(v
, props
.uc
, errorCode
);
441 case PPUCD_NAME_ALIAS
:
444 case PPUCD_CONDITIONAL_CASE_MAPPINGS
:
445 case PPUCD_TURKIC_CASE_FOLDING
:
446 // No need to parse their values: They are hardcoded in the runtime library.
448 case UCHAR_SCRIPT_EXTENSIONS
:
449 parseScriptExtensions(v
, props
.scx
, errorCode
);
452 // Ignore unhandled properties.
// Record the property in newValues only when no error was raised.
456 if(U_SUCCESS(errorCode
)) {
457 newValues
.add((UChar32
)prop
);
// Parses the code point range of the current algnamesrange line into start and end.
//   start, end: output range, filled in by parseCodePointRange().
//   errorCode:  in/out error code. FALSE is returned immediately if it already indicates
//               failure. U_ILLEGAL_ARGUMENT_ERROR is set when the current line is not an
//               ALG_NAMES_RANGE_LINE; U_PARSE_ERROR when the range field is missing.
// Returns the result of parseCodePointRange() on the success path.
// NOTE(review): the return type, the missing-field test, the fprintf() opener, the early
// returns and closing braces are not part of this extract.
465 PreparsedUCD::getRangeForAlgNames(UChar32
&start
, UChar32
&end
, UErrorCode
&errorCode
) {
466 if(U_FAILURE(errorCode
)) { return FALSE
; }
467 if(lineType
!=ALG_NAMES_RANGE_LINE
) {
468 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// The second field holds the range.
472 const char *field
=nextField();
474 // No range field after the type.
476 "error in preparsed UCD: missing algnamesrange range field "
477 "(no second field) on line %ld\n",
479 errorCode
=U_PARSE_ERROR
;
482 return parseCodePointRange(field
, start
, end
, errorCode
);
486 PreparsedUCD::parseCodePoint(const char *s
, UErrorCode
&errorCode
) {
488 uint32_t value
=(uint32_t)uprv_strtoul(s
, &end
, 16);
489 if(end
<=s
|| *end
!=0 || value
>=0x110000) {
491 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
492 s
, (long)lineNumber
);
493 errorCode
=U_PARSE_ERROR
;
496 return (UChar32
)value
;
// Parses s as a code point range by calling u_parseCodePointRange() with the locals st and e.
//   start, end: output parameters for the parsed range.
//   errorCode:  in/out error code, passed straight to u_parseCodePointRange(); a diagnostic
//               naming s and the current line number is printed when it indicates failure.
// NOTE(review): the return type, the declaration of st/e, the fprintf() opener, the
// assignments to start/end and the return statements are not part of this extract.
500 PreparsedUCD::parseCodePointRange(const char *s
, UChar32
&start
, UChar32
&end
, UErrorCode
&errorCode
) {
502 u_parseCodePointRange(s
, &st
, &e
, &errorCode
);
503 if(U_FAILURE(errorCode
)) {
505 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
506 s
, (long)lineNumber
);
// Parses s into the UnicodeString uni using u_parseString(), writing directly into the
// string's own buffer.
//   s:         text to parse.
//   uni:       receives the result; its length is set through releaseBuffer().
//   errorCode: in/out error code, passed to u_parseString(); a diagnostic naming s and the
//              current line number is printed when it indicates failure at the end.
// NOTE(review): the return type, the fprintf() opener and closing braces are not part of
// this extract.
515 PreparsedUCD::parseString(const char *s
, UnicodeString
&uni
, UErrorCode
&errorCode
) {
// First attempt: parse into whatever capacity the string currently has.
516 UChar
*buffer
=uni
.getBuffer(-1);
517 int32_t length
=u_parseString(s
, buffer
, uni
.getCapacity(), NULL
, &errorCode
);
// On overflow: clear the error, release the buffer as empty, request a buffer of the
// length reported by the first call, and parse again.
518 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
519 errorCode
=U_ZERO_ERROR
;
520 uni
.releaseBuffer(0);
521 buffer
=uni
.getBuffer(length
);
522 length
=u_parseString(s
, buffer
, uni
.getCapacity(), NULL
, &errorCode
);
// Hand the buffer back with the length returned by the last u_parseString() call.
524 uni
.releaseBuffer(length
);
525 if(U_FAILURE(errorCode
)) {
527 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
528 s
, (long)lineNumber
);
// Parses a space-separated list of script codes from s, checking each against the set scx.
//   s:         the value text of an scx= field.
//   scx:       set of script values; a code that is already contained is reported as a duplicate.
//   errorCode: in/out error code. The function returns immediately if it already indicates
//              failure; U_PARSE_ERROR is set for an unknown script code, a duplicate code,
//              or an empty list.
// NOTE(review): the return type, the loop scaffolding, the fprintf() openers, the statement
// that adds a script to the set, and the closing braces are not part of this extract.
533 PreparsedUCD::parseScriptExtensions(const char *s
, UnicodeSet
&scx
, UErrorCode
&errorCode
) {
534 if(U_FAILURE(errorCode
)) { return; }
// Find the end of the current script code (the next space, if any).
539 const char *scLimit
=strchr(s
, ' ');
// Copy the code into scString so that it is NUL-terminated for the lookup below.
541 scs
=scString
.clear().append(s
, (int32_t)(scLimit
-s
), errorCode
).data();
542 if(U_FAILURE(errorCode
)) { return; }
// Resolve the script code as a value name of UCHAR_SCRIPT.
546 int32_t script
=pnames
->getPropertyValueEnum(UCHAR_SCRIPT
, scs
);
547 if(script
==UCHAR_INVALID_CODE
) {
549 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
550 scs
, (long)lineNumber
);
551 errorCode
=U_PARSE_ERROR
;
553 } else if(scx
.contains(script
)) {
555 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
556 scs
, (long)lineNumber
);
557 errorCode
=U_PARSE_ERROR
;
// An scx= field that yields no script codes is an error.
569 fprintf(stderr
, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber
);
570 errorCode
=U_PARSE_ERROR
;