]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
73c04bcf | 4 | * Copyright (C) 2004-2005, International Business Machines |
374ca955 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: gencase.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004aug28 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This program reads several of the Unicode character database text files, | |
17 | * parses them, and extracts the case mapping properties for each character. | |
18 | * It then writes a binary file containing the properties | |
19 | * that is designed to be used directly for random-access to | |
20 | * the properties of each Unicode character. | |
21 | */ | |
22 | ||
23 | #include <stdio.h> | |
24 | #include "unicode/utypes.h" | |
25 | #include "unicode/uchar.h" | |
26 | #include "unicode/uset.h" | |
27 | #include "unicode/putil.h" | |
28 | #include "unicode/uclean.h" | |
29 | #include "cmemory.h" | |
30 | #include "cstring.h" | |
31 | #include "uarrsort.h" | |
32 | #include "unewdata.h" | |
33 | #include "uoptions.h" | |
34 | #include "uparse.h" | |
35 | #include "uprops.h" | |
36 | #include "propsvec.h" | |
37 | #include "gencase.h" | |
38 | ||
39 | #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) | |
40 | ||
41 | /* data --------------------------------------------------------------------- */ | |
42 | ||
43 | uint32_t *pv; | |
44 | ||
45 | UBool beVerbose=FALSE, haveCopyright=TRUE; | |
46 | ||
47 | /* | |
48 | * Unicode set collecting the case-sensitive characters; | |
49 | * see uchar.h UCHAR_CASE_SENSITIVE. | |
50 | * Add code points from case mappings/foldings in | |
51 | * the root locale and with default options. | |
52 | */ | |
53 | static USet *caseSensitive; | |
54 | ||
55 | /* prototypes --------------------------------------------------------------- */ | |
56 | ||
57 | static void | |
58 | parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); | |
59 | ||
60 | static void | |
61 | parseCaseFolding(const char *filename, UErrorCode *pErrorCode); | |
62 | ||
63 | static void | |
64 | parseDB(const char *filename, UErrorCode *pErrorCode); | |
65 | ||
66 | /* parse files with multiple binary properties ------------------------------ */ | |
67 | ||
68 | /* TODO: more common code, move functions to uparse.h|c */ | |
69 | ||
70 | /* TODO: similar to genprops/props2.c but not the same */ | |
71 | ||
/*
 * One binary property that binariesLineFn() recognizes in a UCD file,
 * together with the arguments that it passes to upvec_setValue() for it.
 */
typedef struct Binary {
    const char *propName;   /* property name as it appears in the file */
    int32_t vecWord;        /* word (column) argument for upvec_setValue() */
    uint32_t vecValue;      /* value argument for upvec_setValue() */
    uint32_t vecMask;       /* mask argument for upvec_setValue(); must not be 0 */
} Binary;
78 | ||
79 | struct Binaries { | |
80 | const char *ucdFile; | |
81 | const Binary *binaries; | |
82 | int32_t binariesCount; | |
83 | }; | |
84 | typedef struct Binaries Binaries; | |
85 | ||
86 | static const Binary | |
87 | propListNames[]={ | |
88 | { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK } | |
89 | }; | |
90 | ||
91 | static const Binaries | |
92 | propListBinaries={ | |
93 | "PropList", propListNames, LENGTHOF(propListNames) | |
94 | }; | |
95 | ||
96 | static const Binary | |
97 | derCorePropsNames[]={ | |
98 | { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK }, | |
99 | { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK } | |
100 | }; | |
101 | ||
102 | static const Binaries | |
103 | derCorePropsBinaries={ | |
104 | "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) | |
105 | }; | |
106 | ||
73c04bcf A |
107 | /* treat Word_Break=MidLetter as a binary property (we ignore all other Word_Break values) */ |
108 | static const Binary | |
109 | wordBreakNames[]={ | |
110 | { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) } | |
111 | }; | |
112 | ||
113 | static const Binaries | |
114 | wordBreakBinaries={ | |
115 | "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames) | |
116 | }; | |
117 | ||
374ca955 A |
118 | static void U_CALLCONV |
119 | binariesLineFn(void *context, | |
120 | char *fields[][2], int32_t fieldCount, | |
121 | UErrorCode *pErrorCode) { | |
122 | const Binaries *bin; | |
123 | char *s; | |
124 | uint32_t start, limit; | |
125 | int32_t i; | |
126 | ||
127 | bin=(const Binaries *)context; | |
128 | ||
129 | u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode); | |
130 | if(U_FAILURE(*pErrorCode)) { | |
131 | fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); | |
132 | exit(*pErrorCode); | |
133 | } | |
134 | ++limit; | |
135 | ||
136 | /* parse binary property name */ | |
137 | s=(char *)u_skipWhitespace(fields[1][0]); | |
138 | for(i=0;; ++i) { | |
139 | if(i==bin->binariesCount) { | |
140 | /* ignore unrecognized properties */ | |
141 | return; | |
142 | } | |
143 | if(isToken(bin->binaries[i].propName, s)) { | |
144 | break; | |
145 | } | |
146 | } | |
147 | ||
148 | if(bin->binaries[i].vecMask==0) { | |
149 | fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n", | |
150 | (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName); | |
151 | exit(U_INTERNAL_PROGRAM_ERROR); | |
152 | } | |
153 | ||
154 | if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) { | |
155 | fprintf(stderr, "gencase error: unable to set %s, code: %s\n", | |
156 | bin->binaries[i].propName, u_errorName(*pErrorCode)); | |
157 | exit(*pErrorCode); | |
158 | } | |
159 | } | |
160 | ||
161 | static void | |
162 | parseBinariesFile(char *filename, char *basename, const char *suffix, | |
163 | const Binaries *bin, | |
164 | UErrorCode *pErrorCode) { | |
165 | char *fields[2][2]; | |
166 | ||
167 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
168 | return; | |
169 | } | |
170 | ||
171 | writeUCDFilename(basename, bin->ucdFile, suffix); | |
172 | ||
173 | u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); | |
174 | if(U_FAILURE(*pErrorCode)) { | |
175 | fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); | |
176 | } | |
177 | } | |
178 | ||
179 | /* -------------------------------------------------------------------------- */ | |
180 | ||
181 | enum | |
182 | { | |
183 | HELP_H, | |
184 | HELP_QUESTION_MARK, | |
185 | VERBOSE, | |
186 | COPYRIGHT, | |
187 | DESTDIR, | |
188 | SOURCEDIR, | |
189 | UNICODE_VERSION, | |
73c04bcf A |
190 | ICUDATADIR, |
191 | CSOURCE | |
374ca955 A |
192 | }; |
193 | ||
194 | /* Keep these values in sync with the above enums */ | |
195 | static UOption options[]={ | |
196 | UOPTION_HELP_H, | |
197 | UOPTION_HELP_QUESTION_MARK, | |
198 | UOPTION_VERBOSE, | |
199 | UOPTION_COPYRIGHT, | |
200 | UOPTION_DESTDIR, | |
201 | UOPTION_SOURCEDIR, | |
73c04bcf A |
202 | UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), |
203 | UOPTION_ICUDATADIR, | |
204 | UOPTION_DEF("csource", 'C', UOPT_NO_ARG) | |
374ca955 A |
205 | }; |
206 | ||
207 | extern int | |
208 | main(int argc, char* argv[]) { | |
209 | char filename[300]; | |
210 | const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; | |
211 | char *basename=NULL; | |
212 | UErrorCode errorCode=U_ZERO_ERROR; | |
213 | ||
214 | U_MAIN_INIT_ARGS(argc, argv); | |
215 | ||
216 | /* preset then read command line options */ | |
217 | options[DESTDIR].value=u_getDataDirectory(); | |
218 | options[SOURCEDIR].value=""; | |
219 | options[UNICODE_VERSION].value=""; | |
220 | options[ICUDATADIR].value=u_getDataDirectory(); | |
221 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); | |
222 | ||
223 | /* error handling, printing usage message */ | |
224 | if(argc<0) { | |
225 | fprintf(stderr, | |
226 | "error in command line argument \"%s\"\n", | |
227 | argv[-argc]); | |
228 | } | |
229 | if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { | |
230 | /* | |
231 | * Broken into chucks because the C89 standard says the minimum | |
232 | * required supported string length is 509 bytes. | |
233 | */ | |
234 | fprintf(stderr, | |
235 | "Usage: %s [-options] [suffix]\n" | |
236 | "\n" | |
237 | "read the UnicodeData.txt file and other Unicode properties files and\n" | |
238 | "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n" | |
239 | "\n", | |
240 | argv[0]); | |
241 | fprintf(stderr, | |
242 | "Options:\n" | |
243 | "\t-h or -? or --help this usage text\n" | |
244 | "\t-v or --verbose verbose output\n" | |
245 | "\t-c or --copyright include a copyright notice\n" | |
73c04bcf A |
246 | "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" |
247 | "\t-C or --csource generate a .c source file rather than the .icu binary\n"); | |
374ca955 A |
248 | fprintf(stderr, |
249 | "\t-d or --destdir destination directory, followed by the path\n" | |
250 | "\t-s or --sourcedir source directory, followed by the path\n" | |
251 | "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" | |
252 | "\t followed by path, defaults to %s\n" | |
253 | "\tsuffix suffix that is to be appended with a '-'\n" | |
254 | "\t to the source file basenames before opening;\n" | |
255 | "\t 'gencase new' will read UnicodeData-new.txt etc.\n", | |
256 | u_getDataDirectory()); | |
257 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
258 | } | |
259 | ||
260 | /* get the options values */ | |
261 | beVerbose=options[VERBOSE].doesOccur; | |
262 | haveCopyright=options[COPYRIGHT].doesOccur; | |
263 | srcDir=options[SOURCEDIR].value; | |
264 | destDir=options[DESTDIR].value; | |
265 | ||
266 | if(argc>=2) { | |
267 | suffix=argv[1]; | |
268 | } else { | |
269 | suffix=NULL; | |
270 | } | |
271 | ||
272 | if(options[UNICODE_VERSION].doesOccur) { | |
273 | setUnicodeVersion(options[UNICODE_VERSION].value); | |
274 | } | |
275 | /* else use the default dataVersion in store.c */ | |
276 | ||
277 | if (options[ICUDATADIR].doesOccur) { | |
278 | u_setDataDirectory(options[ICUDATADIR].value); | |
279 | } | |
280 | ||
281 | /* prepare the filename beginning with the source dir */ | |
282 | uprv_strcpy(filename, srcDir); | |
283 | basename=filename+uprv_strlen(filename); | |
284 | if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { | |
285 | *basename++=U_FILE_SEP_CHAR; | |
286 | } | |
287 | ||
288 | /* initialize */ | |
73c04bcf | 289 | pv=upvec_open(2, 10000); |
374ca955 A |
290 | caseSensitive=uset_open(1, 0); /* empty set (start>end) */ |
291 | ||
292 | /* process SpecialCasing.txt */ | |
293 | writeUCDFilename(basename, "SpecialCasing", suffix); | |
294 | parseSpecialCasing(filename, &errorCode); | |
295 | ||
296 | /* process CaseFolding.txt */ | |
297 | writeUCDFilename(basename, "CaseFolding", suffix); | |
298 | parseCaseFolding(filename, &errorCode); | |
299 | ||
300 | /* process additional properties files */ | |
301 | *basename=0; | |
302 | ||
303 | parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode); | |
304 | ||
305 | parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode); | |
306 | ||
73c04bcf A |
307 | if(ucdVersion>=UNI_4_1) { |
308 | parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode); | |
309 | } | |
310 | ||
374ca955 A |
311 | /* process UnicodeData.txt */ |
312 | writeUCDFilename(basename, "UnicodeData", suffix); | |
313 | parseDB(filename, &errorCode); | |
314 | ||
315 | /* process parsed data */ | |
316 | makeCaseClosure(); | |
317 | ||
318 | makeExceptions(); | |
319 | ||
320 | if(U_SUCCESS(errorCode)) { | |
321 | /* write the properties data file */ | |
73c04bcf | 322 | generateData(destDir, options[CSOURCE].doesOccur); |
374ca955 A |
323 | } |
324 | ||
325 | u_cleanup(); | |
326 | return errorCode; | |
327 | } | |
328 | ||
329 | U_CFUNC void | |
330 | writeUCDFilename(char *basename, const char *filename, const char *suffix) { | |
331 | int32_t length=(int32_t)uprv_strlen(filename); | |
332 | uprv_strcpy(basename, filename); | |
333 | if(suffix!=NULL) { | |
334 | basename[length++]='-'; | |
335 | uprv_strcpy(basename+length, suffix); | |
336 | length+=(int32_t)uprv_strlen(suffix); | |
337 | } | |
338 | uprv_strcpy(basename+length, ".txt"); | |
339 | } | |
340 | ||
341 | /* TODO: move to toolutil */ | |
342 | U_CFUNC UBool | |
343 | isToken(const char *token, const char *s) { | |
344 | const char *z; | |
345 | int32_t j; | |
346 | ||
347 | s=u_skipWhitespace(s); | |
348 | for(j=0;; ++j) { | |
349 | if(token[j]!=0) { | |
350 | if(s[j]!=token[j]) { | |
351 | break; | |
352 | } | |
353 | } else { | |
354 | z=u_skipWhitespace(s+j); | |
355 | if(*z==';' || *z==0) { | |
356 | return TRUE; | |
357 | } else { | |
358 | break; | |
359 | } | |
360 | } | |
361 | } | |
362 | ||
363 | return FALSE; | |
364 | } | |
365 | ||
/*
 * Find the first of the countTokens tokens that s starts with, after leading white space.
 * A token matches only if it is followed - after optional white space - by
 * ';', '#', CR, LF or the end of the string. NULL entries in tokens[] are skipped.
 * Returns the index of the matching token, or -1 if there is none.
 */
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    int32_t i;

    s=u_skipWhitespace(s);
    for(i=0; i<countTokens; ++i) {
        const char *t=tokens[i];
        const char *p=s;
        const char *z;

        if(t==NULL) {
            continue;
        }

        /* walk along the common prefix */
        while(*t!=0 && *p==*t) {
            ++t;
            ++p;
        }
        if(*t!=0) {
            /* mismatch before the end of the token */
            continue;
        }

        z=u_skipWhitespace(p);
        switch(*z) {
        case ';':
        case 0:
        case '#':
        case '\r':
        case '\n':
            return i;
        default:
            break;
        }
    }
    return -1;
}
393 | ||
394 | static void | |
395 | _set_addAll(USet *set, const UChar *s, int32_t length) { | |
396 | UChar32 c; | |
397 | int32_t i; | |
398 | ||
399 | /* needs length>=0 */ | |
400 | for(i=0; i<length; /* U16_NEXT advances i */) { | |
401 | U16_NEXT(s, i, length, c); | |
402 | uset_add(set, c); | |
403 | } | |
404 | } | |
405 | ||
406 | /* parser for SpecialCasing.txt --------------------------------------------- */ | |
407 | ||
408 | #define MAX_SPECIAL_CASING_COUNT 500 | |
409 | ||
410 | static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT]; | |
411 | static int32_t specialCasingCount=0; | |
412 | ||
413 | static void U_CALLCONV | |
414 | specialCasingLineFn(void *context, | |
415 | char *fields[][2], int32_t fieldCount, | |
416 | UErrorCode *pErrorCode) { | |
417 | char *end; | |
418 | ||
419 | /* get code point */ | |
420 | specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); | |
421 | end=(char *)u_skipWhitespace(end); | |
422 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
423 | fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); | |
424 | *pErrorCode=U_PARSE_ERROR; | |
425 | exit(U_PARSE_ERROR); | |
426 | } | |
427 | ||
428 | /* is this a complex mapping? */ | |
429 | if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') { | |
430 | /* there is some condition text in the fifth field */ | |
431 | specialCasings[specialCasingCount].isComplex=TRUE; | |
432 | ||
433 | /* do not store any actual mappings for this */ | |
434 | specialCasings[specialCasingCount].lowerCase[0]=0; | |
435 | specialCasings[specialCasingCount].upperCase[0]=0; | |
436 | specialCasings[specialCasingCount].titleCase[0]=0; | |
437 | } else { | |
438 | /* just set the "complex" flag and get the case mappings */ | |
439 | specialCasings[specialCasingCount].isComplex=FALSE; | |
440 | specialCasings[specialCasingCount].lowerCase[0]= | |
441 | (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode); | |
442 | specialCasings[specialCasingCount].upperCase[0]= | |
443 | (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode); | |
444 | specialCasings[specialCasingCount].titleCase[0]= | |
445 | (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode); | |
446 | if(U_FAILURE(*pErrorCode)) { | |
447 | fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]); | |
448 | exit(*pErrorCode); | |
449 | } | |
450 | ||
451 | uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code); | |
452 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]); | |
453 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]); | |
454 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]); | |
455 | } | |
456 | ||
457 | if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) { | |
458 | fprintf(stderr, "gencase: too many special casing mappings\n"); | |
459 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
460 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
461 | } | |
462 | } | |
463 | ||
464 | static int32_t U_CALLCONV | |
465 | compareSpecialCasings(const void *context, const void *left, const void *right) { | |
466 | return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code; | |
467 | } | |
468 | ||
469 | static void | |
470 | parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { | |
471 | char *fields[5][2]; | |
472 | int32_t i, j; | |
473 | ||
474 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
475 | return; | |
476 | } | |
477 | ||
478 | u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); | |
479 | ||
480 | /* sort the special casing entries by code point */ | |
481 | if(specialCasingCount>0) { | |
482 | uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), | |
483 | compareSpecialCasings, NULL, FALSE, pErrorCode); | |
484 | } | |
485 | if(U_FAILURE(*pErrorCode)) { | |
486 | return; | |
487 | } | |
488 | ||
489 | /* replace multiple entries for any code point by one "complex" one */ | |
490 | j=0; | |
491 | for(i=1; i<specialCasingCount; ++i) { | |
492 | if(specialCasings[i-1].code==specialCasings[i].code) { | |
493 | /* there is a duplicate code point */ | |
494 | specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */ | |
495 | specialCasings[i].isComplex=TRUE; /* make the following one complex */ | |
496 | specialCasings[i].lowerCase[0]=0; | |
497 | specialCasings[i].upperCase[0]=0; | |
498 | specialCasings[i].titleCase[0]=0; | |
499 | ++j; | |
500 | } | |
501 | } | |
502 | ||
503 | /* if some entries just were removed, then re-sort */ | |
504 | if(j>0) { | |
505 | uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), | |
506 | compareSpecialCasings, NULL, FALSE, pErrorCode); | |
507 | specialCasingCount-=j; | |
508 | } | |
509 | if(U_FAILURE(*pErrorCode)) { | |
510 | return; | |
511 | } | |
512 | ||
513 | /* | |
514 | * Add one complex mapping to caseSensitive that was filtered out above: | |
515 | * Greek final Sigma has a conditional mapping but not locale-sensitive, | |
516 | * and it is taken when lowercasing just U+03A3 alone. | |
517 | * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA | |
518 | */ | |
519 | uset_add(caseSensitive, 0x3c2); | |
520 | } | |
521 | ||
522 | /* parser for CaseFolding.txt ----------------------------------------------- */ | |
523 | ||
524 | #define MAX_CASE_FOLDING_COUNT 2000 | |
525 | ||
526 | static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; | |
527 | static int32_t caseFoldingCount=0; | |
528 | ||
529 | static void U_CALLCONV | |
530 | caseFoldingLineFn(void *context, | |
531 | char *fields[][2], int32_t fieldCount, | |
532 | UErrorCode *pErrorCode) { | |
533 | char *end; | |
534 | static UChar32 prevCode=0; | |
535 | int32_t count; | |
536 | char status; | |
537 | ||
538 | /* get code point */ | |
539 | caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); | |
540 | end=(char *)u_skipWhitespace(end); | |
541 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
542 | fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); | |
543 | *pErrorCode=U_PARSE_ERROR; | |
544 | exit(U_PARSE_ERROR); | |
545 | } | |
546 | ||
547 | /* get the status of this mapping */ | |
548 | caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); | |
549 | if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { | |
550 | fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); | |
551 | *pErrorCode=U_PARSE_ERROR; | |
552 | exit(U_PARSE_ERROR); | |
553 | } | |
554 | ||
555 | /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ | |
556 | if(status=='L') { | |
557 | return; | |
558 | } | |
559 | ||
560 | /* get the mapping */ | |
561 | count=caseFoldings[caseFoldingCount].full[0]= | |
562 | (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode); | |
563 | if(U_FAILURE(*pErrorCode)) { | |
564 | fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); | |
565 | exit(*pErrorCode); | |
566 | } | |
567 | ||
568 | /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ | |
569 | if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { | |
570 | caseFoldings[caseFoldingCount].simple=0; | |
571 | } | |
572 | ||
573 | /* update the case-sensitive set */ | |
574 | if(status!='T') { | |
575 | uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); | |
576 | _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); | |
577 | } | |
578 | ||
579 | /* check the status */ | |
580 | if(status=='S') { | |
581 | /* check if there was a full mapping for this code point before */ | |
582 | if( caseFoldingCount>0 && | |
583 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && | |
584 | caseFoldings[caseFoldingCount-1].status=='F' | |
585 | ) { | |
586 | /* merge the two entries */ | |
587 | caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; | |
588 | return; | |
589 | } | |
590 | } else if(status=='F') { | |
591 | /* check if there was a simple mapping for this code point before */ | |
592 | if( caseFoldingCount>0 && | |
593 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && | |
594 | caseFoldings[caseFoldingCount-1].status=='S' | |
595 | ) { | |
596 | /* merge the two entries */ | |
597 | uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); | |
598 | return; | |
599 | } | |
600 | } else if(status=='I' || status=='T') { | |
601 | /* check if there was a default mapping for this code point before (remove it) */ | |
602 | while(caseFoldingCount>0 && | |
603 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code | |
604 | ) { | |
605 | prevCode=0; | |
606 | --caseFoldingCount; | |
607 | } | |
608 | /* store only a marker for special handling for cases like dotless i */ | |
609 | caseFoldings[caseFoldingCount].simple=0; | |
610 | caseFoldings[caseFoldingCount].full[0]=0; | |
611 | } | |
612 | ||
613 | /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ | |
614 | if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { | |
615 | fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", | |
616 | (unsigned long)caseFoldings[caseFoldingCount].code, | |
617 | (unsigned long)prevCode); | |
618 | *pErrorCode=U_PARSE_ERROR; | |
619 | exit(U_PARSE_ERROR); | |
620 | } | |
621 | prevCode=caseFoldings[caseFoldingCount].code; | |
622 | ||
623 | if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { | |
624 | fprintf(stderr, "gencase: too many case folding mappings\n"); | |
625 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
626 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
627 | } | |
628 | } | |
629 | ||
630 | static void | |
631 | parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { | |
632 | char *fields[3][2]; | |
633 | ||
634 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
635 | return; | |
636 | } | |
637 | ||
638 | u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); | |
639 | } | |
640 | ||
641 | /* parser for UnicodeData.txt ----------------------------------------------- */ | |
642 | ||
643 | /* general categories */ | |
644 | const char *const | |
645 | genCategoryNames[U_CHAR_CATEGORY_COUNT]={ | |
646 | "Cn", | |
647 | "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", | |
648 | "Mc", "Nd", "Nl", "No", | |
649 | "Zs", "Zl", "Zp", | |
650 | "Cc", "Cf", "Co", "Cs", | |
651 | "Pd", "Ps", "Pe", "Pc", "Po", | |
652 | "Sm", "Sc", "Sk", "So", | |
653 | "Pi", "Pf" | |
654 | }; | |
655 | ||
656 | static int32_t specialCasingIndex=0, caseFoldingIndex=0; | |
657 | ||
658 | static void U_CALLCONV | |
659 | unicodeDataLineFn(void *context, | |
660 | char *fields[][2], int32_t fieldCount, | |
661 | UErrorCode *pErrorCode) { | |
662 | Props p; | |
663 | char *end; | |
664 | static UChar32 prevCode=0; | |
665 | UChar32 value; | |
666 | int32_t i; | |
667 | ||
668 | /* reset the properties */ | |
669 | uprv_memset(&p, 0, sizeof(Props)); | |
670 | ||
671 | /* get the character code, field 0 */ | |
672 | p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16); | |
673 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
674 | fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]); | |
675 | *pErrorCode=U_PARSE_ERROR; | |
676 | exit(U_PARSE_ERROR); | |
677 | } | |
678 | ||
679 | /* get general category, field 2 */ | |
680 | i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]); | |
681 | if(i>=0) { | |
682 | p.gc=(uint8_t)i; | |
683 | } else { | |
684 | fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n", | |
685 | fields[2][0], (unsigned long)p.code); | |
686 | *pErrorCode=U_PARSE_ERROR; | |
687 | exit(U_PARSE_ERROR); | |
688 | } | |
689 | ||
690 | /* get canonical combining class, field 3 */ | |
691 | value=(UChar32)uprv_strtoul(fields[3][0], &end, 10); | |
692 | if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { | |
693 | fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]); | |
694 | *pErrorCode=U_PARSE_ERROR; | |
695 | exit(U_PARSE_ERROR); | |
696 | } | |
697 | p.cc=(uint8_t)value; | |
698 | ||
699 | /* get uppercase mapping, field 12 */ | |
700 | value=(UChar32)uprv_strtoul(fields[12][0], &end, 16); | |
701 | if(end!=fields[12][1]) { | |
702 | fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n", | |
703 | (unsigned long)p.code); | |
704 | *pErrorCode=U_PARSE_ERROR; | |
705 | exit(U_PARSE_ERROR); | |
706 | } | |
707 | if(value!=0 && value!=p.code) { | |
708 | p.upperCase=value; | |
709 | uset_add(caseSensitive, p.code); | |
710 | uset_add(caseSensitive, value); | |
711 | } | |
712 | ||
713 | /* get lowercase value, field 13 */ | |
714 | value=(UChar32)uprv_strtoul(fields[13][0], &end, 16); | |
715 | if(end!=fields[13][1]) { | |
716 | fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n", | |
717 | (unsigned long)p.code); | |
718 | *pErrorCode=U_PARSE_ERROR; | |
719 | exit(U_PARSE_ERROR); | |
720 | } | |
721 | if(value!=0 && value!=p.code) { | |
722 | p.lowerCase=value; | |
723 | uset_add(caseSensitive, p.code); | |
724 | uset_add(caseSensitive, value); | |
725 | } | |
726 | ||
727 | /* get titlecase value, field 14 */ | |
728 | value=(UChar32)uprv_strtoul(fields[14][0], &end, 16); | |
729 | if(end!=fields[14][1]) { | |
730 | fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n", | |
731 | (unsigned long)p.code); | |
732 | *pErrorCode=U_PARSE_ERROR; | |
733 | exit(U_PARSE_ERROR); | |
734 | } | |
735 | if(value!=0 && value!=p.code) { | |
736 | p.titleCase=value; | |
737 | uset_add(caseSensitive, p.code); | |
738 | uset_add(caseSensitive, value); | |
739 | } | |
740 | ||
741 | /* set additional properties from previously parsed files */ | |
742 | if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) { | |
743 | p.specialCasing=specialCasings+specialCasingIndex++; | |
744 | } else { | |
745 | p.specialCasing=NULL; | |
746 | } | |
747 | if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) { | |
748 | p.caseFolding=caseFoldings+caseFoldingIndex++; | |
749 | ||
750 | /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */ | |
751 | if( p.caseFolding->status=='C' && | |
752 | p.caseFolding->simple==p.lowerCase | |
753 | ) { | |
754 | p.caseFolding=NULL; | |
755 | } | |
756 | } else { | |
757 | p.caseFolding=NULL; | |
758 | } | |
759 | ||
760 | /* check for non-character code points */ | |
761 | if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { | |
762 | fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n", | |
763 | (unsigned long)p.code); | |
764 | *pErrorCode=U_PARSE_ERROR; | |
765 | exit(U_PARSE_ERROR); | |
766 | } | |
767 | ||
768 | /* check that the code points (p.code) are in ascending order */ | |
769 | if(p.code<=prevCode && p.code>0) { | |
770 | fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", | |
771 | (unsigned long)p.code, (unsigned long)prevCode); | |
772 | *pErrorCode=U_PARSE_ERROR; | |
773 | exit(U_PARSE_ERROR); | |
774 | } | |
775 | ||
776 | /* properties for a single code point */ | |
777 | setProps(&p); | |
778 | ||
779 | prevCode=p.code; | |
780 | } | |
781 | ||
782 | static void | |
783 | parseDB(const char *filename, UErrorCode *pErrorCode) { | |
784 | char *fields[15][2]; | |
785 | UChar32 start, end; | |
786 | int32_t i; | |
787 | ||
788 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
789 | return; | |
790 | } | |
791 | ||
792 | u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); | |
793 | ||
794 | /* are all sub-properties consumed? */ | |
795 | if(specialCasingIndex<specialCasingCount) { | |
796 | fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n"); | |
797 | *pErrorCode=U_PARSE_ERROR; | |
798 | exit(U_PARSE_ERROR); | |
799 | } | |
800 | if(caseFoldingIndex<caseFoldingCount) { | |
801 | fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n"); | |
802 | *pErrorCode=U_PARSE_ERROR; | |
803 | exit(U_PARSE_ERROR); | |
804 | } | |
805 | ||
806 | if(U_FAILURE(*pErrorCode)) { | |
807 | return; | |
808 | } | |
809 | ||
810 | for(i=0; | |
811 | 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode); | |
812 | ++i | |
813 | ) { | |
814 | addCaseSensitive(start, end); | |
815 | } | |
816 | if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) { | |
817 | *pErrorCode=U_ZERO_ERROR; | |
818 | } | |
819 | } | |
820 | ||
821 | /* | |
822 | * Hey, Emacs, please set the following: | |
823 | * | |
824 | * Local Variables: | |
825 | * indent-tabs-mode: nil | |
826 | * End: | |
827 | * | |
828 | */ |