]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/toolutil/ppucd.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / ppucd.cpp
CommitLineData
4388f060
A
1/*
2*******************************************************************************
3* Copyright (C) 2011-2012, International Business Machines
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* file name: ppucd.cpp
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2011dec11
12* created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/uchar.h"
17#include "charstr.h"
18#include "cstring.h"
19#include "ppucd.h"
20#include "uassert.h"
21#include "uparse.h"
22
23#include <stdio.h>
24#include <string.h>
25
// Number of elements of a true array (not a pointer), converted to int32_t.
// The whole expansion is parenthesized so that it acts as a single operand
// wherever the macro is used (for example directly after sizeof or before a postfix operator).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
27
28U_NAMESPACE_BEGIN
29
30PropertyNames::~PropertyNames() {}
31
32int32_t
33PropertyNames::getPropertyEnum(const char *name) const {
34 return u_getPropertyEnum(name);
35}
36
37int32_t
38PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
39 return u_getPropertyValueEnum((UProperty)property, name);
40}
41
42UniProps::UniProps()
43 : start(U_SENTINEL), end(U_SENTINEL),
44 bmg(U_SENTINEL),
45 scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
46 digitValue(-1), numericValue(NULL),
47 name(NULL), nameAlias(NULL) {
48 memset(binProps, 0, sizeof(binProps));
49 memset(intProps, 0, sizeof(intProps));
50 memset(age, 0, 4);
51}
52
53UniProps::~UniProps() {}
54
55const int32_t PreparsedUCD::kNumLineBuffers;
56
57PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
58 : icuPnames(new PropertyNames()), pnames(icuPnames),
59 file(NULL),
60 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
61 lineNumber(0),
62 lineType(NO_LINE),
63 fieldLimit(NULL), lineLimit(NULL) {
64 if(U_FAILURE(errorCode)) { return; }
65
66 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
67 filename=NULL;
68 file=stdin;
69 } else {
70 file=fopen(filename, "r");
71 }
72 if(file==NULL) {
73 perror("error opening preparsed UCD");
74 fprintf(stderr, "error opening preparsed UCD file %s\n", filename);
75 errorCode=U_FILE_ACCESS_ERROR;
76 return;
77 }
78
79 memset(ucdVersion, 0, 4);
80 lines[0][0]=0;
81}
82
83PreparsedUCD::~PreparsedUCD() {
84 if(file!=stdin) {
85 fclose(file);
86 }
87 delete icuPnames;
88}
89
// First-field keywords of the line types; same order as the LineType values.
// The first two entries are NULL: readLine() starts matching after EMPTY_LINE.
// Both the strings and the table entries are const so that the table cannot be modified.
static const char *const lineTypeStrings[]={
    NULL,
    NULL,
    "ucd",
    "property",
    "binary",
    "value",
    "defaults",
    "block",
    "cp",
    "algnamesrange"
};
103
104PreparsedUCD::LineType
105PreparsedUCD::readLine(UErrorCode &errorCode) {
106 if(U_FAILURE(errorCode)) { return NO_LINE; }
107 // Select the next available line buffer.
108 while(!isLineBufferAvailable(lineIndex)) {
109 ++lineIndex;
110 if (lineIndex == kNumLineBuffers) {
111 lineIndex = 0;
112 }
113 }
114 char *line=lines[lineIndex];
115 *line=0;
116 lineLimit=fieldLimit=line;
117 lineType=NO_LINE;
118 char *result=fgets(line, sizeof(lines[0]), file);
119 if(result==NULL) {
120 if(ferror(file)) {
121 perror("error reading preparsed UCD");
122 fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
123 errorCode=U_FILE_ACCESS_ERROR;
124 }
125 return NO_LINE;
126 }
127 ++lineNumber;
128 if(*line=='#') {
129 fieldLimit=strchr(line, 0);
130 return lineType=EMPTY_LINE;
131 }
132 // Remove trailing /r/n.
133 char c;
134 char *limit=strchr(line, 0);
135 while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
136 // Remove trailing white space.
137 while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
138 *limit=0;
139 lineLimit=limit;
140 if(line==limit) {
141 fieldLimit=limit;
142 return lineType=EMPTY_LINE;
143 }
144 // Split by ';'.
145 char *semi=line;
146 while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
147 fieldLimit=strchr(line, 0);
148 // Determine the line type.
149 int32_t type;
150 for(type=EMPTY_LINE+1;; ++type) {
151 if(type==LINE_TYPE_COUNT) {
152 fprintf(stderr,
153 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
154 line, (long)lineNumber);
155 errorCode=U_PARSE_ERROR;
156 return NO_LINE;
157 }
158 if(0==strcmp(line, lineTypeStrings[type])) {
159 break;
160 }
161 }
162 lineType=(LineType)type;
163 if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
164 u_versionFromString(ucdVersion, fieldLimit+1);
165 }
166 return lineType;
167}
168
169const char *
170PreparsedUCD::firstField() {
171 char *field=lines[lineIndex];
172 fieldLimit=strchr(field, 0);
173 return field;
174}
175
176const char *
177PreparsedUCD::nextField() {
178 if(fieldLimit==lineLimit) { return NULL; }
179 char *field=fieldLimit+1;
180 fieldLimit=strchr(field, 0);
181 return field;
182}
183
184const UniProps *
185PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
186 if(U_FAILURE(errorCode)) { return NULL; }
187 newValues.clear();
188 if(!lineHasPropertyValues()) {
189 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
190 return NULL;
191 }
192 firstField();
193 const char *field=nextField();
194 if(field==NULL) {
195 // No range field after the type.
196 fprintf(stderr,
197 "error in preparsed UCD: missing default/block/cp range field "
198 "(no second field) on line %ld\n",
199 (long)lineNumber);
200 errorCode=U_PARSE_ERROR;
201 return NULL;
202 }
203 UChar32 start, end;
204 if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
205 UniProps *props;
206 switch(lineType) {
207 case DEFAULTS_LINE:
208 if(defaultLineIndex>=0) {
209 fprintf(stderr,
210 "error in preparsed UCD: second line with default properties on line %ld\n",
211 (long)lineNumber);
212 errorCode=U_PARSE_ERROR;
213 return NULL;
214 }
215 if(start!=0 || end!=0x10ffff) {
216 fprintf(stderr,
217 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
218 field, (long)lineNumber);
219 errorCode=U_PARSE_ERROR;
220 return NULL;
221 }
222 props=&defaultProps;
223 defaultLineIndex=lineIndex;
224 break;
225 case BLOCK_LINE:
226 blockProps=defaultProps; // Block inherits default properties.
227 props=&blockProps;
228 blockLineIndex=lineIndex;
229 break;
230 case CP_LINE:
231 if(blockProps.start<=start && end<=blockProps.end) {
232 // Code point range fully inside the last block inherits the block properties.
233 cpProps=blockProps;
234 } else if(start>blockProps.end || end<blockProps.start) {
235 // Code point range fully outside the last block inherits the default properties.
236 cpProps=defaultProps;
237 } else {
238 // Code point range partially overlapping with the last block is illegal.
239 fprintf(stderr,
240 "error in preparsed UCD: cp range %s on line %ld only "
241 "partially overlaps with block range %04lX..%04lX\n",
242 field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
243 errorCode=U_PARSE_ERROR;
244 return NULL;
245 }
246 props=&cpProps;
247 break;
248 default:
249 // Will not occur because of the range check above.
250 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
251 return NULL;
252 }
253 props->start=start;
254 props->end=end;
255 while((field=nextField())!=NULL) {
256 if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
257 }
258 return props;
259}
260
261static const struct {
262 const char *name;
263 int32_t prop;
264} ppucdProperties[]={
265 { "Name_Alias", PPUCD_NAME_ALIAS },
266 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
267 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
268};
269
270// Returns TRUE for "ok to continue parsing fields".
271UBool
272PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
273 UErrorCode &errorCode) {
274 CharString pBuffer;
275 const char *p=field;
276 const char *v=strchr(p, '=');
277 int binaryValue;
278 if(*p=='-') {
279 if(v!=NULL) {
280 fprintf(stderr,
281 "error in preparsed UCD: mix of binary-property-no and "
282 "enum-property syntax '%s' on line %ld\n",
283 field, (long)lineNumber);
284 errorCode=U_PARSE_ERROR;
285 return FALSE;
286 }
287 binaryValue=0;
288 ++p;
289 } else if(v==NULL) {
290 binaryValue=1;
291 } else {
292 binaryValue=-1;
293 // Copy out the property name rather than modifying the field (writing a NUL).
294 pBuffer.append(p, (int32_t)(v-p), errorCode);
295 p=pBuffer.data();
296 ++v;
297 }
298 int32_t prop=pnames->getPropertyEnum(p);
299 if(prop<0) {
300 for(int32_t i=0;; ++i) {
301 if(i==LENGTHOF(ppucdProperties)) {
302 // Ignore unknown property names.
303 return TRUE;
304 }
305 if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
306 prop=ppucdProperties[i].prop;
307 U_ASSERT(prop>=0);
308 break;
309 }
310 }
311 }
312 if(prop<UCHAR_BINARY_LIMIT) {
313 if(binaryValue>=0) {
314 props.binProps[prop]=(UBool)binaryValue;
315 } else {
316 // No binary value for a binary property.
317 fprintf(stderr,
318 "error in preparsed UCD: enum-property syntax '%s' "
319 "for binary property on line %ld\n",
320 field, (long)lineNumber);
321 errorCode=U_PARSE_ERROR;
322 }
323 } else if(binaryValue>=0) {
324 // Binary value for a non-binary property.
325 fprintf(stderr,
326 "error in preparsed UCD: binary-property syntax '%s' "
327 "for non-binary property on line %ld\n",
328 field, (long)lineNumber);
329 errorCode=U_PARSE_ERROR;
330 } else if(prop<UCHAR_INT_LIMIT) {
331 int32_t value=pnames->getPropertyValueEnum(prop, v);
332 if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
333 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
334 char *end;
335 unsigned long ccc=uprv_strtoul(v, &end, 10);
336 if(v<end && *end==0 && ccc<=254) {
337 value=(int32_t)ccc;
338 }
339 }
340 if(value==UCHAR_INVALID_CODE) {
341 fprintf(stderr,
342 "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
343 field, (long)lineNumber);
344 errorCode=U_PARSE_ERROR;
345 } else {
346 props.intProps[prop-UCHAR_INT_START]=value;
347 }
348 } else if(*v=='<') {
349 // Do not parse default values like <code point>, just set null values.
350 switch(prop) {
351 case UCHAR_BIDI_MIRRORING_GLYPH:
352 props.bmg=U_SENTINEL;
353 break;
354 case UCHAR_SIMPLE_CASE_FOLDING:
355 props.scf=U_SENTINEL;
356 break;
357 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
358 props.slc=U_SENTINEL;
359 break;
360 case UCHAR_SIMPLE_TITLECASE_MAPPING:
361 props.stc=U_SENTINEL;
362 break;
363 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
364 props.suc=U_SENTINEL;
365 break;
366 case UCHAR_CASE_FOLDING:
367 props.cf.remove();
368 break;
369 case UCHAR_LOWERCASE_MAPPING:
370 props.lc.remove();
371 break;
372 case UCHAR_TITLECASE_MAPPING:
373 props.tc.remove();
374 break;
375 case UCHAR_UPPERCASE_MAPPING:
376 props.uc.remove();
377 break;
378 case UCHAR_SCRIPT_EXTENSIONS:
379 props.scx.clear();
380 break;
381 default:
382 fprintf(stderr,
383 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
384 field, (long)lineNumber);
385 errorCode=U_PARSE_ERROR;
386 }
387 } else {
388 char c;
389 switch(prop) {
390 case UCHAR_NUMERIC_VALUE:
391 props.numericValue=v;
392 c=*v;
393 if('0'<=c && c<='9' && v[1]==0) {
394 props.digitValue=c-'0';
395 } else {
396 props.digitValue=-1;
397 }
398 break;
399 case UCHAR_NAME:
400 props.name=v;
401 break;
402 case UCHAR_AGE:
403 u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric.
404 break;
405 case UCHAR_BIDI_MIRRORING_GLYPH:
406 props.bmg=parseCodePoint(v, errorCode);
407 break;
408 case UCHAR_SIMPLE_CASE_FOLDING:
409 props.scf=parseCodePoint(v, errorCode);
410 break;
411 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
412 props.slc=parseCodePoint(v, errorCode);
413 break;
414 case UCHAR_SIMPLE_TITLECASE_MAPPING:
415 props.stc=parseCodePoint(v, errorCode);
416 break;
417 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
418 props.suc=parseCodePoint(v, errorCode);
419 break;
420 case UCHAR_CASE_FOLDING:
421 parseString(v, props.cf, errorCode);
422 break;
423 case UCHAR_LOWERCASE_MAPPING:
424 parseString(v, props.lc, errorCode);
425 break;
426 case UCHAR_TITLECASE_MAPPING:
427 parseString(v, props.tc, errorCode);
428 break;
429 case UCHAR_UPPERCASE_MAPPING:
430 parseString(v, props.uc, errorCode);
431 break;
432 case PPUCD_NAME_ALIAS:
433 props.nameAlias=v;
434 break;
435 case PPUCD_CONDITIONAL_CASE_MAPPINGS:
436 case PPUCD_TURKIC_CASE_FOLDING:
437 // No need to parse their values: They are hardcoded in the runtime library.
438 break;
439 case UCHAR_SCRIPT_EXTENSIONS:
440 parseScriptExtensions(v, props.scx, errorCode);
441 break;
442 default:
443 // Ignore unhandled properties.
444 return TRUE;
445 }
446 }
447 if(U_SUCCESS(errorCode)) {
448 newValues.add((UChar32)prop);
449 return TRUE;
450 } else {
451 return FALSE;
452 }
453}
454
455UBool
456PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
457 if(U_FAILURE(errorCode)) { return FALSE; }
458 if(lineType!=ALG_NAMES_RANGE_LINE) {
459 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
460 return FALSE;
461 }
462 firstField();
463 const char *field=nextField();
464 if(field==NULL) {
465 // No range field after the type.
466 fprintf(stderr,
467 "error in preparsed UCD: missing algnamesrange range field "
468 "(no second field) on line %ld\n",
469 (long)lineNumber);
470 errorCode=U_PARSE_ERROR;
471 return FALSE;
472 }
473 return parseCodePointRange(field, start, end, errorCode);
474}
475
476UChar32
477PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
478 char *end;
479 uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
480 if(end<=s || *end!=0 || value>=0x110000) {
481 fprintf(stderr,
482 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
483 s, (long)lineNumber);
484 errorCode=U_PARSE_ERROR;
485 return U_SENTINEL;
486 }
487 return (UChar32)value;
488}
489
490UBool
491PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
492 uint32_t st, e;
493 u_parseCodePointRange(s, &st, &e, &errorCode);
494 if(U_FAILURE(errorCode)) {
495 fprintf(stderr,
496 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
497 s, (long)lineNumber);
498 return FALSE;
499 }
500 start=(UChar32)st;
501 end=(UChar32)e;
502 return TRUE;
503}
504
505void
506PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
507 UChar *buffer=uni.getBuffer(-1);
508 int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
509 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
510 errorCode=U_ZERO_ERROR;
511 uni.releaseBuffer(0);
512 buffer=uni.getBuffer(length);
513 length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
514 }
515 uni.releaseBuffer(length);
516 if(U_FAILURE(errorCode)) {
517 fprintf(stderr,
518 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
519 s, (long)lineNumber);
520 }
521}
522
523void
524PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
525 if(U_FAILURE(errorCode)) { return; }
526 scx.clear();
527 CharString scString;
528 for(;;) {
529 const char *scs;
530 const char *scLimit=strchr(s, ' ');
531 if(scLimit!=NULL) {
532 scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
533 if(U_FAILURE(errorCode)) { return; }
534 } else {
535 scs=s;
536 }
537 int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
538 if(script==UCHAR_INVALID_CODE) {
539 fprintf(stderr,
540 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
541 scs, (long)lineNumber);
542 errorCode=U_PARSE_ERROR;
543 return;
544 } else if(scx.contains(script)) {
545 fprintf(stderr,
546 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
547 scs, (long)lineNumber);
548 errorCode=U_PARSE_ERROR;
549 return;
550 } else {
551 scx.add(script);
552 }
553 if(scLimit!=NULL) {
554 s=scLimit+1;
555 } else {
556 break;
557 }
558 }
559 if(scx.isEmpty()) {
560 fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
561 errorCode=U_PARSE_ERROR;
562 }
563}
564
565U_NAMESPACE_END