]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
46f4442e | 4 | * Copyright (C) 2004-2008, International Business Machines |
374ca955 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: gencase.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004aug28 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This program reads several of the Unicode character database text files, | |
17 | * parses them, and extracts the case mapping properties for each character. | |
18 | * It then writes a binary file containing the properties | |
19 | * that is designed to be used directly for random-access to | |
20 | * the properties of each Unicode character. | |
21 | */ | |
22 | ||
23 | #include <stdio.h> | |
24 | #include "unicode/utypes.h" | |
25 | #include "unicode/uchar.h" | |
26 | #include "unicode/uset.h" | |
27 | #include "unicode/putil.h" | |
28 | #include "unicode/uclean.h" | |
29 | #include "cmemory.h" | |
30 | #include "cstring.h" | |
31 | #include "uarrsort.h" | |
32 | #include "unewdata.h" | |
33 | #include "uoptions.h" | |
34 | #include "uparse.h" | |
35 | #include "uprops.h" | |
36 | #include "propsvec.h" | |
37 | #include "gencase.h" | |
38 | ||
39 | #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) | |
40 | ||
41 | /* data --------------------------------------------------------------------- */ | |
42 | ||
43 | uint32_t *pv; | |
44 | ||
45 | UBool beVerbose=FALSE, haveCopyright=TRUE; | |
46 | ||
47 | /* | |
48 | * Unicode set collecting the case-sensitive characters; | |
49 | * see uchar.h UCHAR_CASE_SENSITIVE. | |
50 | * Add code points from case mappings/foldings in | |
51 | * the root locale and with default options. | |
52 | */ | |
53 | static USet *caseSensitive; | |
54 | ||
55 | /* prototypes --------------------------------------------------------------- */ | |
56 | ||
57 | static void | |
58 | parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); | |
59 | ||
60 | static void | |
61 | parseCaseFolding(const char *filename, UErrorCode *pErrorCode); | |
62 | ||
63 | static void | |
64 | parseDB(const char *filename, UErrorCode *pErrorCode); | |
65 | ||
66 | /* parse files with multiple binary properties ------------------------------ */ | |
67 | ||
68 | /* TODO: more common code, move functions to uparse.h|c */ | |
69 | ||
70 | /* TODO: similar to genprops/props2.c but not the same */ | |
71 | ||
/*
 * Describes one binary property (or property value) that is read from a
 * UCD file and stored into the properties vectors.
 */
typedef struct Binary {
    const char *propName;   /* name to match in field 1 of the UCD file */
    int32_t vecWord;        /* column argument for upvec_setValue() */
    uint32_t vecValue;      /* value argument for upvec_setValue() */
    uint32_t vecMask;       /* mask argument for upvec_setValue(); must not be 0 */
} Binary;
78 | ||
79 | struct Binaries { | |
80 | const char *ucdFile; | |
81 | const Binary *binaries; | |
82 | int32_t binariesCount; | |
83 | }; | |
84 | typedef struct Binaries Binaries; | |
85 | ||
86 | static const Binary | |
87 | propListNames[]={ | |
88 | { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK } | |
89 | }; | |
90 | ||
91 | static const Binaries | |
92 | propListBinaries={ | |
93 | "PropList", propListNames, LENGTHOF(propListNames) | |
94 | }; | |
95 | ||
96 | static const Binary | |
97 | derCorePropsNames[]={ | |
98 | { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK }, | |
99 | { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK } | |
100 | }; | |
101 | ||
102 | static const Binaries | |
103 | derCorePropsBinaries={ | |
104 | "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) | |
105 | }; | |
106 | ||
46f4442e A |
107 | /* |
108 | * Treat Word_Break=MidLetter and MidNumLet as a single binary property. | |
109 | * We need not distinguish between them because both add to case-ignorable. | |
110 | * We ignore all other Word_Break values. | |
111 | */ | |
73c04bcf A |
112 | static const Binary |
113 | wordBreakNames[]={ | |
46f4442e A |
114 | { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }, |
115 | { "MidNumLet", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) } | |
73c04bcf A |
116 | }; |
117 | ||
118 | static const Binaries | |
119 | wordBreakBinaries={ | |
120 | "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames) | |
121 | }; | |
122 | ||
374ca955 A |
123 | static void U_CALLCONV |
124 | binariesLineFn(void *context, | |
125 | char *fields[][2], int32_t fieldCount, | |
126 | UErrorCode *pErrorCode) { | |
127 | const Binaries *bin; | |
128 | char *s; | |
129 | uint32_t start, limit; | |
130 | int32_t i; | |
131 | ||
132 | bin=(const Binaries *)context; | |
133 | ||
134 | u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode); | |
135 | if(U_FAILURE(*pErrorCode)) { | |
136 | fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); | |
137 | exit(*pErrorCode); | |
138 | } | |
139 | ++limit; | |
140 | ||
141 | /* parse binary property name */ | |
142 | s=(char *)u_skipWhitespace(fields[1][0]); | |
143 | for(i=0;; ++i) { | |
144 | if(i==bin->binariesCount) { | |
145 | /* ignore unrecognized properties */ | |
146 | return; | |
147 | } | |
148 | if(isToken(bin->binaries[i].propName, s)) { | |
149 | break; | |
150 | } | |
151 | } | |
152 | ||
153 | if(bin->binaries[i].vecMask==0) { | |
154 | fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n", | |
155 | (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName); | |
156 | exit(U_INTERNAL_PROGRAM_ERROR); | |
157 | } | |
158 | ||
159 | if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) { | |
160 | fprintf(stderr, "gencase error: unable to set %s, code: %s\n", | |
161 | bin->binaries[i].propName, u_errorName(*pErrorCode)); | |
162 | exit(*pErrorCode); | |
163 | } | |
164 | } | |
165 | ||
166 | static void | |
167 | parseBinariesFile(char *filename, char *basename, const char *suffix, | |
168 | const Binaries *bin, | |
169 | UErrorCode *pErrorCode) { | |
170 | char *fields[2][2]; | |
171 | ||
172 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
173 | return; | |
174 | } | |
175 | ||
176 | writeUCDFilename(basename, bin->ucdFile, suffix); | |
177 | ||
178 | u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); | |
179 | if(U_FAILURE(*pErrorCode)) { | |
180 | fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); | |
181 | } | |
182 | } | |
183 | ||
184 | /* -------------------------------------------------------------------------- */ | |
185 | ||
186 | enum | |
187 | { | |
188 | HELP_H, | |
189 | HELP_QUESTION_MARK, | |
190 | VERBOSE, | |
191 | COPYRIGHT, | |
192 | DESTDIR, | |
193 | SOURCEDIR, | |
194 | UNICODE_VERSION, | |
73c04bcf A |
195 | ICUDATADIR, |
196 | CSOURCE | |
374ca955 A |
197 | }; |
198 | ||
199 | /* Keep these values in sync with the above enums */ | |
200 | static UOption options[]={ | |
201 | UOPTION_HELP_H, | |
202 | UOPTION_HELP_QUESTION_MARK, | |
203 | UOPTION_VERBOSE, | |
204 | UOPTION_COPYRIGHT, | |
205 | UOPTION_DESTDIR, | |
206 | UOPTION_SOURCEDIR, | |
73c04bcf A |
207 | UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), |
208 | UOPTION_ICUDATADIR, | |
209 | UOPTION_DEF("csource", 'C', UOPT_NO_ARG) | |
374ca955 A |
210 | }; |
211 | ||
212 | extern int | |
213 | main(int argc, char* argv[]) { | |
214 | char filename[300]; | |
215 | const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; | |
216 | char *basename=NULL; | |
217 | UErrorCode errorCode=U_ZERO_ERROR; | |
218 | ||
219 | U_MAIN_INIT_ARGS(argc, argv); | |
220 | ||
221 | /* preset then read command line options */ | |
222 | options[DESTDIR].value=u_getDataDirectory(); | |
223 | options[SOURCEDIR].value=""; | |
224 | options[UNICODE_VERSION].value=""; | |
225 | options[ICUDATADIR].value=u_getDataDirectory(); | |
226 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); | |
227 | ||
228 | /* error handling, printing usage message */ | |
229 | if(argc<0) { | |
230 | fprintf(stderr, | |
231 | "error in command line argument \"%s\"\n", | |
232 | argv[-argc]); | |
233 | } | |
234 | if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { | |
235 | /* | |
236 | * Broken into chucks because the C89 standard says the minimum | |
237 | * required supported string length is 509 bytes. | |
238 | */ | |
239 | fprintf(stderr, | |
240 | "Usage: %s [-options] [suffix]\n" | |
241 | "\n" | |
242 | "read the UnicodeData.txt file and other Unicode properties files and\n" | |
243 | "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n" | |
244 | "\n", | |
245 | argv[0]); | |
246 | fprintf(stderr, | |
247 | "Options:\n" | |
248 | "\t-h or -? or --help this usage text\n" | |
249 | "\t-v or --verbose verbose output\n" | |
250 | "\t-c or --copyright include a copyright notice\n" | |
73c04bcf A |
251 | "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" |
252 | "\t-C or --csource generate a .c source file rather than the .icu binary\n"); | |
374ca955 A |
253 | fprintf(stderr, |
254 | "\t-d or --destdir destination directory, followed by the path\n" | |
255 | "\t-s or --sourcedir source directory, followed by the path\n" | |
256 | "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" | |
257 | "\t followed by path, defaults to %s\n" | |
258 | "\tsuffix suffix that is to be appended with a '-'\n" | |
259 | "\t to the source file basenames before opening;\n" | |
260 | "\t 'gencase new' will read UnicodeData-new.txt etc.\n", | |
261 | u_getDataDirectory()); | |
262 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
263 | } | |
264 | ||
265 | /* get the options values */ | |
266 | beVerbose=options[VERBOSE].doesOccur; | |
267 | haveCopyright=options[COPYRIGHT].doesOccur; | |
268 | srcDir=options[SOURCEDIR].value; | |
269 | destDir=options[DESTDIR].value; | |
270 | ||
271 | if(argc>=2) { | |
272 | suffix=argv[1]; | |
273 | } else { | |
274 | suffix=NULL; | |
275 | } | |
276 | ||
277 | if(options[UNICODE_VERSION].doesOccur) { | |
278 | setUnicodeVersion(options[UNICODE_VERSION].value); | |
279 | } | |
280 | /* else use the default dataVersion in store.c */ | |
281 | ||
282 | if (options[ICUDATADIR].doesOccur) { | |
283 | u_setDataDirectory(options[ICUDATADIR].value); | |
284 | } | |
285 | ||
286 | /* prepare the filename beginning with the source dir */ | |
287 | uprv_strcpy(filename, srcDir); | |
288 | basename=filename+uprv_strlen(filename); | |
289 | if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { | |
290 | *basename++=U_FILE_SEP_CHAR; | |
291 | } | |
292 | ||
293 | /* initialize */ | |
73c04bcf | 294 | pv=upvec_open(2, 10000); |
374ca955 A |
295 | caseSensitive=uset_open(1, 0); /* empty set (start>end) */ |
296 | ||
297 | /* process SpecialCasing.txt */ | |
298 | writeUCDFilename(basename, "SpecialCasing", suffix); | |
299 | parseSpecialCasing(filename, &errorCode); | |
300 | ||
301 | /* process CaseFolding.txt */ | |
302 | writeUCDFilename(basename, "CaseFolding", suffix); | |
303 | parseCaseFolding(filename, &errorCode); | |
304 | ||
305 | /* process additional properties files */ | |
306 | *basename=0; | |
307 | ||
308 | parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode); | |
309 | ||
310 | parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode); | |
311 | ||
73c04bcf A |
312 | if(ucdVersion>=UNI_4_1) { |
313 | parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode); | |
314 | } | |
315 | ||
374ca955 A |
316 | /* process UnicodeData.txt */ |
317 | writeUCDFilename(basename, "UnicodeData", suffix); | |
318 | parseDB(filename, &errorCode); | |
319 | ||
320 | /* process parsed data */ | |
321 | makeCaseClosure(); | |
322 | ||
323 | makeExceptions(); | |
324 | ||
325 | if(U_SUCCESS(errorCode)) { | |
326 | /* write the properties data file */ | |
73c04bcf | 327 | generateData(destDir, options[CSOURCE].doesOccur); |
374ca955 A |
328 | } |
329 | ||
330 | u_cleanup(); | |
331 | return errorCode; | |
332 | } | |
333 | ||
334 | U_CFUNC void | |
335 | writeUCDFilename(char *basename, const char *filename, const char *suffix) { | |
336 | int32_t length=(int32_t)uprv_strlen(filename); | |
337 | uprv_strcpy(basename, filename); | |
338 | if(suffix!=NULL) { | |
339 | basename[length++]='-'; | |
340 | uprv_strcpy(basename+length, suffix); | |
341 | length+=(int32_t)uprv_strlen(suffix); | |
342 | } | |
343 | uprv_strcpy(basename+length, ".txt"); | |
344 | } | |
345 | ||
346 | /* TODO: move to toolutil */ | |
347 | U_CFUNC UBool | |
348 | isToken(const char *token, const char *s) { | |
349 | const char *z; | |
350 | int32_t j; | |
351 | ||
352 | s=u_skipWhitespace(s); | |
353 | for(j=0;; ++j) { | |
354 | if(token[j]!=0) { | |
355 | if(s[j]!=token[j]) { | |
356 | break; | |
357 | } | |
358 | } else { | |
359 | z=u_skipWhitespace(s+j); | |
360 | if(*z==';' || *z==0) { | |
361 | return TRUE; | |
362 | } else { | |
363 | break; | |
364 | } | |
365 | } | |
366 | } | |
367 | ||
368 | return FALSE; | |
369 | } | |
370 | ||
371 | static int32_t | |
372 | getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { | |
373 | const char *t, *z; | |
374 | int32_t i, j; | |
375 | ||
376 | s=u_skipWhitespace(s); | |
377 | for(i=0; i<countTokens; ++i) { | |
378 | t=tokens[i]; | |
379 | if(t!=NULL) { | |
380 | for(j=0;; ++j) { | |
381 | if(t[j]!=0) { | |
382 | if(s[j]!=t[j]) { | |
383 | break; | |
384 | } | |
385 | } else { | |
386 | z=u_skipWhitespace(s+j); | |
387 | if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') { | |
388 | return i; | |
389 | } else { | |
390 | break; | |
391 | } | |
392 | } | |
393 | } | |
394 | } | |
395 | } | |
396 | return -1; | |
397 | } | |
398 | ||
399 | static void | |
400 | _set_addAll(USet *set, const UChar *s, int32_t length) { | |
401 | UChar32 c; | |
402 | int32_t i; | |
403 | ||
404 | /* needs length>=0 */ | |
405 | for(i=0; i<length; /* U16_NEXT advances i */) { | |
406 | U16_NEXT(s, i, length, c); | |
407 | uset_add(set, c); | |
408 | } | |
409 | } | |
410 | ||
411 | /* parser for SpecialCasing.txt --------------------------------------------- */ | |
412 | ||
413 | #define MAX_SPECIAL_CASING_COUNT 500 | |
414 | ||
415 | static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT]; | |
416 | static int32_t specialCasingCount=0; | |
417 | ||
418 | static void U_CALLCONV | |
419 | specialCasingLineFn(void *context, | |
420 | char *fields[][2], int32_t fieldCount, | |
421 | UErrorCode *pErrorCode) { | |
422 | char *end; | |
423 | ||
424 | /* get code point */ | |
425 | specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); | |
426 | end=(char *)u_skipWhitespace(end); | |
427 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
428 | fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); | |
429 | *pErrorCode=U_PARSE_ERROR; | |
430 | exit(U_PARSE_ERROR); | |
431 | } | |
432 | ||
433 | /* is this a complex mapping? */ | |
434 | if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') { | |
435 | /* there is some condition text in the fifth field */ | |
436 | specialCasings[specialCasingCount].isComplex=TRUE; | |
437 | ||
438 | /* do not store any actual mappings for this */ | |
439 | specialCasings[specialCasingCount].lowerCase[0]=0; | |
440 | specialCasings[specialCasingCount].upperCase[0]=0; | |
441 | specialCasings[specialCasingCount].titleCase[0]=0; | |
442 | } else { | |
443 | /* just set the "complex" flag and get the case mappings */ | |
444 | specialCasings[specialCasingCount].isComplex=FALSE; | |
445 | specialCasings[specialCasingCount].lowerCase[0]= | |
446 | (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode); | |
447 | specialCasings[specialCasingCount].upperCase[0]= | |
448 | (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode); | |
449 | specialCasings[specialCasingCount].titleCase[0]= | |
450 | (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode); | |
451 | if(U_FAILURE(*pErrorCode)) { | |
452 | fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]); | |
453 | exit(*pErrorCode); | |
454 | } | |
455 | ||
456 | uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code); | |
457 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]); | |
458 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]); | |
459 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]); | |
460 | } | |
461 | ||
462 | if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) { | |
463 | fprintf(stderr, "gencase: too many special casing mappings\n"); | |
464 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
465 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
466 | } | |
467 | } | |
468 | ||
469 | static int32_t U_CALLCONV | |
470 | compareSpecialCasings(const void *context, const void *left, const void *right) { | |
471 | return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code; | |
472 | } | |
473 | ||
474 | static void | |
475 | parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { | |
476 | char *fields[5][2]; | |
477 | int32_t i, j; | |
478 | ||
479 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
480 | return; | |
481 | } | |
482 | ||
483 | u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); | |
484 | ||
485 | /* sort the special casing entries by code point */ | |
486 | if(specialCasingCount>0) { | |
487 | uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), | |
488 | compareSpecialCasings, NULL, FALSE, pErrorCode); | |
489 | } | |
490 | if(U_FAILURE(*pErrorCode)) { | |
491 | return; | |
492 | } | |
493 | ||
494 | /* replace multiple entries for any code point by one "complex" one */ | |
495 | j=0; | |
496 | for(i=1; i<specialCasingCount; ++i) { | |
497 | if(specialCasings[i-1].code==specialCasings[i].code) { | |
498 | /* there is a duplicate code point */ | |
499 | specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */ | |
500 | specialCasings[i].isComplex=TRUE; /* make the following one complex */ | |
501 | specialCasings[i].lowerCase[0]=0; | |
502 | specialCasings[i].upperCase[0]=0; | |
503 | specialCasings[i].titleCase[0]=0; | |
504 | ++j; | |
505 | } | |
506 | } | |
507 | ||
508 | /* if some entries just were removed, then re-sort */ | |
509 | if(j>0) { | |
510 | uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), | |
511 | compareSpecialCasings, NULL, FALSE, pErrorCode); | |
512 | specialCasingCount-=j; | |
513 | } | |
514 | if(U_FAILURE(*pErrorCode)) { | |
515 | return; | |
516 | } | |
517 | ||
518 | /* | |
519 | * Add one complex mapping to caseSensitive that was filtered out above: | |
520 | * Greek final Sigma has a conditional mapping but not locale-sensitive, | |
521 | * and it is taken when lowercasing just U+03A3 alone. | |
522 | * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA | |
523 | */ | |
524 | uset_add(caseSensitive, 0x3c2); | |
525 | } | |
526 | ||
527 | /* parser for CaseFolding.txt ----------------------------------------------- */ | |
528 | ||
529 | #define MAX_CASE_FOLDING_COUNT 2000 | |
530 | ||
531 | static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; | |
532 | static int32_t caseFoldingCount=0; | |
533 | ||
534 | static void U_CALLCONV | |
535 | caseFoldingLineFn(void *context, | |
536 | char *fields[][2], int32_t fieldCount, | |
537 | UErrorCode *pErrorCode) { | |
538 | char *end; | |
539 | static UChar32 prevCode=0; | |
540 | int32_t count; | |
541 | char status; | |
542 | ||
543 | /* get code point */ | |
544 | caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); | |
545 | end=(char *)u_skipWhitespace(end); | |
546 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
547 | fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); | |
548 | *pErrorCode=U_PARSE_ERROR; | |
549 | exit(U_PARSE_ERROR); | |
550 | } | |
551 | ||
552 | /* get the status of this mapping */ | |
553 | caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); | |
554 | if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { | |
555 | fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); | |
556 | *pErrorCode=U_PARSE_ERROR; | |
557 | exit(U_PARSE_ERROR); | |
558 | } | |
559 | ||
560 | /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ | |
561 | if(status=='L') { | |
562 | return; | |
563 | } | |
564 | ||
565 | /* get the mapping */ | |
566 | count=caseFoldings[caseFoldingCount].full[0]= | |
567 | (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode); | |
568 | if(U_FAILURE(*pErrorCode)) { | |
569 | fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); | |
570 | exit(*pErrorCode); | |
571 | } | |
572 | ||
573 | /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ | |
574 | if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { | |
575 | caseFoldings[caseFoldingCount].simple=0; | |
576 | } | |
577 | ||
578 | /* update the case-sensitive set */ | |
579 | if(status!='T') { | |
580 | uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); | |
581 | _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); | |
582 | } | |
583 | ||
584 | /* check the status */ | |
585 | if(status=='S') { | |
586 | /* check if there was a full mapping for this code point before */ | |
587 | if( caseFoldingCount>0 && | |
588 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && | |
589 | caseFoldings[caseFoldingCount-1].status=='F' | |
590 | ) { | |
591 | /* merge the two entries */ | |
592 | caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; | |
593 | return; | |
594 | } | |
595 | } else if(status=='F') { | |
596 | /* check if there was a simple mapping for this code point before */ | |
597 | if( caseFoldingCount>0 && | |
598 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && | |
599 | caseFoldings[caseFoldingCount-1].status=='S' | |
600 | ) { | |
601 | /* merge the two entries */ | |
602 | uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); | |
603 | return; | |
604 | } | |
605 | } else if(status=='I' || status=='T') { | |
606 | /* check if there was a default mapping for this code point before (remove it) */ | |
607 | while(caseFoldingCount>0 && | |
608 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code | |
609 | ) { | |
610 | prevCode=0; | |
611 | --caseFoldingCount; | |
612 | } | |
613 | /* store only a marker for special handling for cases like dotless i */ | |
614 | caseFoldings[caseFoldingCount].simple=0; | |
615 | caseFoldings[caseFoldingCount].full[0]=0; | |
616 | } | |
617 | ||
618 | /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ | |
619 | if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { | |
620 | fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", | |
621 | (unsigned long)caseFoldings[caseFoldingCount].code, | |
622 | (unsigned long)prevCode); | |
623 | *pErrorCode=U_PARSE_ERROR; | |
624 | exit(U_PARSE_ERROR); | |
625 | } | |
626 | prevCode=caseFoldings[caseFoldingCount].code; | |
627 | ||
628 | if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { | |
629 | fprintf(stderr, "gencase: too many case folding mappings\n"); | |
630 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
631 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
632 | } | |
633 | } | |
634 | ||
635 | static void | |
636 | parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { | |
637 | char *fields[3][2]; | |
638 | ||
639 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
640 | return; | |
641 | } | |
642 | ||
643 | u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); | |
644 | } | |
645 | ||
646 | /* parser for UnicodeData.txt ----------------------------------------------- */ | |
647 | ||
648 | /* general categories */ | |
649 | const char *const | |
650 | genCategoryNames[U_CHAR_CATEGORY_COUNT]={ | |
651 | "Cn", | |
652 | "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", | |
653 | "Mc", "Nd", "Nl", "No", | |
654 | "Zs", "Zl", "Zp", | |
655 | "Cc", "Cf", "Co", "Cs", | |
656 | "Pd", "Ps", "Pe", "Pc", "Po", | |
657 | "Sm", "Sc", "Sk", "So", | |
658 | "Pi", "Pf" | |
659 | }; | |
660 | ||
661 | static int32_t specialCasingIndex=0, caseFoldingIndex=0; | |
662 | ||
663 | static void U_CALLCONV | |
664 | unicodeDataLineFn(void *context, | |
665 | char *fields[][2], int32_t fieldCount, | |
666 | UErrorCode *pErrorCode) { | |
667 | Props p; | |
668 | char *end; | |
669 | static UChar32 prevCode=0; | |
670 | UChar32 value; | |
671 | int32_t i; | |
672 | ||
673 | /* reset the properties */ | |
674 | uprv_memset(&p, 0, sizeof(Props)); | |
675 | ||
676 | /* get the character code, field 0 */ | |
677 | p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16); | |
678 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
679 | fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]); | |
680 | *pErrorCode=U_PARSE_ERROR; | |
681 | exit(U_PARSE_ERROR); | |
682 | } | |
683 | ||
684 | /* get general category, field 2 */ | |
685 | i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]); | |
686 | if(i>=0) { | |
687 | p.gc=(uint8_t)i; | |
688 | } else { | |
689 | fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n", | |
690 | fields[2][0], (unsigned long)p.code); | |
691 | *pErrorCode=U_PARSE_ERROR; | |
692 | exit(U_PARSE_ERROR); | |
693 | } | |
694 | ||
695 | /* get canonical combining class, field 3 */ | |
696 | value=(UChar32)uprv_strtoul(fields[3][0], &end, 10); | |
697 | if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { | |
698 | fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]); | |
699 | *pErrorCode=U_PARSE_ERROR; | |
700 | exit(U_PARSE_ERROR); | |
701 | } | |
702 | p.cc=(uint8_t)value; | |
703 | ||
704 | /* get uppercase mapping, field 12 */ | |
705 | value=(UChar32)uprv_strtoul(fields[12][0], &end, 16); | |
706 | if(end!=fields[12][1]) { | |
707 | fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n", | |
708 | (unsigned long)p.code); | |
709 | *pErrorCode=U_PARSE_ERROR; | |
710 | exit(U_PARSE_ERROR); | |
711 | } | |
712 | if(value!=0 && value!=p.code) { | |
713 | p.upperCase=value; | |
714 | uset_add(caseSensitive, p.code); | |
715 | uset_add(caseSensitive, value); | |
716 | } | |
717 | ||
718 | /* get lowercase value, field 13 */ | |
719 | value=(UChar32)uprv_strtoul(fields[13][0], &end, 16); | |
720 | if(end!=fields[13][1]) { | |
721 | fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n", | |
722 | (unsigned long)p.code); | |
723 | *pErrorCode=U_PARSE_ERROR; | |
724 | exit(U_PARSE_ERROR); | |
725 | } | |
726 | if(value!=0 && value!=p.code) { | |
727 | p.lowerCase=value; | |
728 | uset_add(caseSensitive, p.code); | |
729 | uset_add(caseSensitive, value); | |
730 | } | |
731 | ||
732 | /* get titlecase value, field 14 */ | |
733 | value=(UChar32)uprv_strtoul(fields[14][0], &end, 16); | |
734 | if(end!=fields[14][1]) { | |
735 | fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n", | |
736 | (unsigned long)p.code); | |
737 | *pErrorCode=U_PARSE_ERROR; | |
738 | exit(U_PARSE_ERROR); | |
739 | } | |
740 | if(value!=0 && value!=p.code) { | |
741 | p.titleCase=value; | |
742 | uset_add(caseSensitive, p.code); | |
743 | uset_add(caseSensitive, value); | |
744 | } | |
745 | ||
746 | /* set additional properties from previously parsed files */ | |
747 | if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) { | |
748 | p.specialCasing=specialCasings+specialCasingIndex++; | |
749 | } else { | |
750 | p.specialCasing=NULL; | |
751 | } | |
752 | if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) { | |
753 | p.caseFolding=caseFoldings+caseFoldingIndex++; | |
754 | ||
755 | /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */ | |
756 | if( p.caseFolding->status=='C' && | |
757 | p.caseFolding->simple==p.lowerCase | |
758 | ) { | |
759 | p.caseFolding=NULL; | |
760 | } | |
761 | } else { | |
762 | p.caseFolding=NULL; | |
763 | } | |
764 | ||
765 | /* check for non-character code points */ | |
766 | if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { | |
767 | fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n", | |
768 | (unsigned long)p.code); | |
769 | *pErrorCode=U_PARSE_ERROR; | |
770 | exit(U_PARSE_ERROR); | |
771 | } | |
772 | ||
773 | /* check that the code points (p.code) are in ascending order */ | |
774 | if(p.code<=prevCode && p.code>0) { | |
775 | fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", | |
776 | (unsigned long)p.code, (unsigned long)prevCode); | |
777 | *pErrorCode=U_PARSE_ERROR; | |
778 | exit(U_PARSE_ERROR); | |
779 | } | |
780 | ||
781 | /* properties for a single code point */ | |
782 | setProps(&p); | |
783 | ||
784 | prevCode=p.code; | |
785 | } | |
786 | ||
787 | static void | |
788 | parseDB(const char *filename, UErrorCode *pErrorCode) { | |
789 | char *fields[15][2]; | |
790 | UChar32 start, end; | |
791 | int32_t i; | |
792 | ||
793 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
794 | return; | |
795 | } | |
796 | ||
797 | u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); | |
798 | ||
799 | /* are all sub-properties consumed? */ | |
800 | if(specialCasingIndex<specialCasingCount) { | |
801 | fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n"); | |
802 | *pErrorCode=U_PARSE_ERROR; | |
803 | exit(U_PARSE_ERROR); | |
804 | } | |
805 | if(caseFoldingIndex<caseFoldingCount) { | |
806 | fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n"); | |
807 | *pErrorCode=U_PARSE_ERROR; | |
808 | exit(U_PARSE_ERROR); | |
809 | } | |
810 | ||
811 | if(U_FAILURE(*pErrorCode)) { | |
812 | return; | |
813 | } | |
814 | ||
815 | for(i=0; | |
816 | 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode); | |
817 | ++i | |
818 | ) { | |
819 | addCaseSensitive(start, end); | |
820 | } | |
821 | if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) { | |
822 | *pErrorCode=U_ZERO_ERROR; | |
823 | } | |
824 | } | |
825 | ||
826 | /* | |
827 | * Hey, Emacs, please set the following: | |
828 | * | |
829 | * Local Variables: | |
830 | * indent-tabs-mode: nil | |
831 | * End: | |
832 | * | |
833 | */ |