]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2004, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: gencase.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004aug28 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This program reads several of the Unicode character database text files, | |
17 | * parses them, and extracts the case mapping properties for each character. | |
18 | * It then writes a binary file containing the properties | |
19 | * that is designed to be used directly for random-access to | |
20 | * the properties of each Unicode character. | |
21 | */ | |
22 | ||
23 | #include <stdio.h> | |
24 | #include "unicode/utypes.h" | |
25 | #include "unicode/uchar.h" | |
26 | #include "unicode/uset.h" | |
27 | #include "unicode/putil.h" | |
28 | #include "unicode/uclean.h" | |
29 | #include "cmemory.h" | |
30 | #include "cstring.h" | |
31 | #include "uarrsort.h" | |
32 | #include "unewdata.h" | |
33 | #include "uoptions.h" | |
34 | #include "uparse.h" | |
35 | #include "uprops.h" | |
36 | #include "propsvec.h" | |
37 | #include "gencase.h" | |
38 | ||
39 | #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) | |
40 | ||
41 | /* data --------------------------------------------------------------------- */ | |
42 | ||
43 | uint32_t *pv; | |
44 | ||
45 | UBool beVerbose=FALSE, haveCopyright=TRUE; | |
46 | ||
47 | /* | |
48 | * Unicode set collecting the case-sensitive characters; | |
49 | * see uchar.h UCHAR_CASE_SENSITIVE. | |
50 | * Add code points from case mappings/foldings in | |
51 | * the root locale and with default options. | |
52 | */ | |
53 | static USet *caseSensitive; | |
54 | ||
55 | /* prototypes --------------------------------------------------------------- */ | |
56 | ||
57 | static void | |
58 | parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); | |
59 | ||
60 | static void | |
61 | parseCaseFolding(const char *filename, UErrorCode *pErrorCode); | |
62 | ||
63 | static void | |
64 | parseDB(const char *filename, UErrorCode *pErrorCode); | |
65 | ||
66 | /* parse files with multiple binary properties ------------------------------ */ | |
67 | ||
68 | /* TODO: more common code, move functions to uparse.h|c */ | |
69 | ||
70 | /* TODO: similar to genprops/props2.c but not the same */ | |
71 | ||
/* Describes how one binary property is stored in the properties vectors. */
typedef struct Binary {
    const char *propName;   /* property name as it appears in the UCD file */
    int32_t vecWord;        /* column passed to upvec_setValue() */
    uint32_t vecValue;      /* value passed to upvec_setValue() */
    uint32_t vecMask;       /* mask passed to upvec_setValue(); must not be 0 */
} Binary;

/* Describes one UCD file together with the binary properties read from it. */
typedef struct Binaries {
    const char *ucdFile;    /* file base name without suffix and ".txt" */
    const Binary *binaries; /* the properties to pick up from this file */
    int32_t binariesCount;  /* number of entries in binaries[] */
} Binaries;
85 | ||
86 | static const Binary | |
87 | propListNames[]={ | |
88 | { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK } | |
89 | }; | |
90 | ||
91 | static const Binaries | |
92 | propListBinaries={ | |
93 | "PropList", propListNames, LENGTHOF(propListNames) | |
94 | }; | |
95 | ||
96 | static const Binary | |
97 | derCorePropsNames[]={ | |
98 | { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK }, | |
99 | { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK } | |
100 | }; | |
101 | ||
102 | static const Binaries | |
103 | derCorePropsBinaries={ | |
104 | "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) | |
105 | }; | |
106 | ||
107 | static void U_CALLCONV | |
108 | binariesLineFn(void *context, | |
109 | char *fields[][2], int32_t fieldCount, | |
110 | UErrorCode *pErrorCode) { | |
111 | const Binaries *bin; | |
112 | char *s; | |
113 | uint32_t start, limit; | |
114 | int32_t i; | |
115 | ||
116 | bin=(const Binaries *)context; | |
117 | ||
118 | u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode); | |
119 | if(U_FAILURE(*pErrorCode)) { | |
120 | fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); | |
121 | exit(*pErrorCode); | |
122 | } | |
123 | ++limit; | |
124 | ||
125 | /* parse binary property name */ | |
126 | s=(char *)u_skipWhitespace(fields[1][0]); | |
127 | for(i=0;; ++i) { | |
128 | if(i==bin->binariesCount) { | |
129 | /* ignore unrecognized properties */ | |
130 | return; | |
131 | } | |
132 | if(isToken(bin->binaries[i].propName, s)) { | |
133 | break; | |
134 | } | |
135 | } | |
136 | ||
137 | if(bin->binaries[i].vecMask==0) { | |
138 | fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n", | |
139 | (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName); | |
140 | exit(U_INTERNAL_PROGRAM_ERROR); | |
141 | } | |
142 | ||
143 | if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) { | |
144 | fprintf(stderr, "gencase error: unable to set %s, code: %s\n", | |
145 | bin->binaries[i].propName, u_errorName(*pErrorCode)); | |
146 | exit(*pErrorCode); | |
147 | } | |
148 | } | |
149 | ||
150 | static void | |
151 | parseBinariesFile(char *filename, char *basename, const char *suffix, | |
152 | const Binaries *bin, | |
153 | UErrorCode *pErrorCode) { | |
154 | char *fields[2][2]; | |
155 | ||
156 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
157 | return; | |
158 | } | |
159 | ||
160 | writeUCDFilename(basename, bin->ucdFile, suffix); | |
161 | ||
162 | u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); | |
163 | if(U_FAILURE(*pErrorCode)) { | |
164 | fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); | |
165 | } | |
166 | } | |
167 | ||
168 | /* -------------------------------------------------------------------------- */ | |
169 | ||
170 | enum | |
171 | { | |
172 | HELP_H, | |
173 | HELP_QUESTION_MARK, | |
174 | VERBOSE, | |
175 | COPYRIGHT, | |
176 | DESTDIR, | |
177 | SOURCEDIR, | |
178 | UNICODE_VERSION, | |
179 | ICUDATADIR | |
180 | }; | |
181 | ||
182 | /* Keep these values in sync with the above enums */ | |
183 | static UOption options[]={ | |
184 | UOPTION_HELP_H, | |
185 | UOPTION_HELP_QUESTION_MARK, | |
186 | UOPTION_VERBOSE, | |
187 | UOPTION_COPYRIGHT, | |
188 | UOPTION_DESTDIR, | |
189 | UOPTION_SOURCEDIR, | |
190 | { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, | |
191 | UOPTION_ICUDATADIR | |
192 | }; | |
193 | ||
194 | extern int | |
195 | main(int argc, char* argv[]) { | |
196 | char filename[300]; | |
197 | const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; | |
198 | char *basename=NULL; | |
199 | UErrorCode errorCode=U_ZERO_ERROR; | |
200 | ||
201 | U_MAIN_INIT_ARGS(argc, argv); | |
202 | ||
203 | /* preset then read command line options */ | |
204 | options[DESTDIR].value=u_getDataDirectory(); | |
205 | options[SOURCEDIR].value=""; | |
206 | options[UNICODE_VERSION].value=""; | |
207 | options[ICUDATADIR].value=u_getDataDirectory(); | |
208 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); | |
209 | ||
210 | /* error handling, printing usage message */ | |
211 | if(argc<0) { | |
212 | fprintf(stderr, | |
213 | "error in command line argument \"%s\"\n", | |
214 | argv[-argc]); | |
215 | } | |
216 | if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { | |
217 | /* | |
218 | * Broken into chucks because the C89 standard says the minimum | |
219 | * required supported string length is 509 bytes. | |
220 | */ | |
221 | fprintf(stderr, | |
222 | "Usage: %s [-options] [suffix]\n" | |
223 | "\n" | |
224 | "read the UnicodeData.txt file and other Unicode properties files and\n" | |
225 | "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n" | |
226 | "\n", | |
227 | argv[0]); | |
228 | fprintf(stderr, | |
229 | "Options:\n" | |
230 | "\t-h or -? or --help this usage text\n" | |
231 | "\t-v or --verbose verbose output\n" | |
232 | "\t-c or --copyright include a copyright notice\n" | |
233 | "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"); | |
234 | fprintf(stderr, | |
235 | "\t-d or --destdir destination directory, followed by the path\n" | |
236 | "\t-s or --sourcedir source directory, followed by the path\n" | |
237 | "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" | |
238 | "\t followed by path, defaults to %s\n" | |
239 | "\tsuffix suffix that is to be appended with a '-'\n" | |
240 | "\t to the source file basenames before opening;\n" | |
241 | "\t 'gencase new' will read UnicodeData-new.txt etc.\n", | |
242 | u_getDataDirectory()); | |
243 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
244 | } | |
245 | ||
246 | /* get the options values */ | |
247 | beVerbose=options[VERBOSE].doesOccur; | |
248 | haveCopyright=options[COPYRIGHT].doesOccur; | |
249 | srcDir=options[SOURCEDIR].value; | |
250 | destDir=options[DESTDIR].value; | |
251 | ||
252 | if(argc>=2) { | |
253 | suffix=argv[1]; | |
254 | } else { | |
255 | suffix=NULL; | |
256 | } | |
257 | ||
258 | if(options[UNICODE_VERSION].doesOccur) { | |
259 | setUnicodeVersion(options[UNICODE_VERSION].value); | |
260 | } | |
261 | /* else use the default dataVersion in store.c */ | |
262 | ||
263 | if (options[ICUDATADIR].doesOccur) { | |
264 | u_setDataDirectory(options[ICUDATADIR].value); | |
265 | } | |
266 | ||
267 | /* prepare the filename beginning with the source dir */ | |
268 | uprv_strcpy(filename, srcDir); | |
269 | basename=filename+uprv_strlen(filename); | |
270 | if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { | |
271 | *basename++=U_FILE_SEP_CHAR; | |
272 | } | |
273 | ||
274 | /* initialize */ | |
275 | pv=upvec_open(1, 10000); | |
276 | caseSensitive=uset_open(1, 0); /* empty set (start>end) */ | |
277 | ||
278 | /* process SpecialCasing.txt */ | |
279 | writeUCDFilename(basename, "SpecialCasing", suffix); | |
280 | parseSpecialCasing(filename, &errorCode); | |
281 | ||
282 | /* process CaseFolding.txt */ | |
283 | writeUCDFilename(basename, "CaseFolding", suffix); | |
284 | parseCaseFolding(filename, &errorCode); | |
285 | ||
286 | /* process additional properties files */ | |
287 | *basename=0; | |
288 | ||
289 | parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode); | |
290 | ||
291 | parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode); | |
292 | ||
293 | /* process UnicodeData.txt */ | |
294 | writeUCDFilename(basename, "UnicodeData", suffix); | |
295 | parseDB(filename, &errorCode); | |
296 | ||
297 | /* process parsed data */ | |
298 | makeCaseClosure(); | |
299 | ||
300 | makeExceptions(); | |
301 | ||
302 | if(U_SUCCESS(errorCode)) { | |
303 | /* write the properties data file */ | |
304 | generateData(destDir); | |
305 | } | |
306 | ||
307 | u_cleanup(); | |
308 | return errorCode; | |
309 | } | |
310 | ||
311 | U_CFUNC void | |
312 | writeUCDFilename(char *basename, const char *filename, const char *suffix) { | |
313 | int32_t length=(int32_t)uprv_strlen(filename); | |
314 | uprv_strcpy(basename, filename); | |
315 | if(suffix!=NULL) { | |
316 | basename[length++]='-'; | |
317 | uprv_strcpy(basename+length, suffix); | |
318 | length+=(int32_t)uprv_strlen(suffix); | |
319 | } | |
320 | uprv_strcpy(basename+length, ".txt"); | |
321 | } | |
322 | ||
323 | /* TODO: move to toolutil */ | |
324 | U_CFUNC UBool | |
325 | isToken(const char *token, const char *s) { | |
326 | const char *z; | |
327 | int32_t j; | |
328 | ||
329 | s=u_skipWhitespace(s); | |
330 | for(j=0;; ++j) { | |
331 | if(token[j]!=0) { | |
332 | if(s[j]!=token[j]) { | |
333 | break; | |
334 | } | |
335 | } else { | |
336 | z=u_skipWhitespace(s+j); | |
337 | if(*z==';' || *z==0) { | |
338 | return TRUE; | |
339 | } else { | |
340 | break; | |
341 | } | |
342 | } | |
343 | } | |
344 | ||
345 | return FALSE; | |
346 | } | |
347 | ||
/*
 * Finds the first entry of tokens[] that s starts with (after leading white
 * space), where the entry must be followed by optional white space and then
 * one of ';', '#', CR, LF or the end of the string.
 * NULL entries in tokens[] are skipped.
 * Returns the index of the matching entry, or -1 if there is none.
 */
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    int32_t index;

    s=u_skipWhitespace(s);
    for(index=0; index<countTokens; ++index) {
        const char *candidate=tokens[index];
        const char *text=s;
        const char *after;

        if(candidate==NULL) {
            continue;
        }

        /* walk along the common prefix */
        while(*candidate!=0 && *text==*candidate) {
            ++candidate;
            ++text;
        }
        if(*candidate!=0) {
            /* s does not start with this token */
            continue;
        }

        /* the token must end here */
        after=u_skipWhitespace(text);
        if(*after==';' || *after==0 || *after=='#' || *after=='\r' || *after=='\n') {
            return index;
        }
    }
    return -1;
}
375 | ||
376 | static void | |
377 | _set_addAll(USet *set, const UChar *s, int32_t length) { | |
378 | UChar32 c; | |
379 | int32_t i; | |
380 | ||
381 | /* needs length>=0 */ | |
382 | for(i=0; i<length; /* U16_NEXT advances i */) { | |
383 | U16_NEXT(s, i, length, c); | |
384 | uset_add(set, c); | |
385 | } | |
386 | } | |
387 | ||
388 | /* parser for SpecialCasing.txt --------------------------------------------- */ | |
389 | ||
390 | #define MAX_SPECIAL_CASING_COUNT 500 | |
391 | ||
392 | static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT]; | |
393 | static int32_t specialCasingCount=0; | |
394 | ||
395 | static void U_CALLCONV | |
396 | specialCasingLineFn(void *context, | |
397 | char *fields[][2], int32_t fieldCount, | |
398 | UErrorCode *pErrorCode) { | |
399 | char *end; | |
400 | ||
401 | /* get code point */ | |
402 | specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); | |
403 | end=(char *)u_skipWhitespace(end); | |
404 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
405 | fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); | |
406 | *pErrorCode=U_PARSE_ERROR; | |
407 | exit(U_PARSE_ERROR); | |
408 | } | |
409 | ||
410 | /* is this a complex mapping? */ | |
411 | if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') { | |
412 | /* there is some condition text in the fifth field */ | |
413 | specialCasings[specialCasingCount].isComplex=TRUE; | |
414 | ||
415 | /* do not store any actual mappings for this */ | |
416 | specialCasings[specialCasingCount].lowerCase[0]=0; | |
417 | specialCasings[specialCasingCount].upperCase[0]=0; | |
418 | specialCasings[specialCasingCount].titleCase[0]=0; | |
419 | } else { | |
420 | /* just set the "complex" flag and get the case mappings */ | |
421 | specialCasings[specialCasingCount].isComplex=FALSE; | |
422 | specialCasings[specialCasingCount].lowerCase[0]= | |
423 | (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode); | |
424 | specialCasings[specialCasingCount].upperCase[0]= | |
425 | (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode); | |
426 | specialCasings[specialCasingCount].titleCase[0]= | |
427 | (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode); | |
428 | if(U_FAILURE(*pErrorCode)) { | |
429 | fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]); | |
430 | exit(*pErrorCode); | |
431 | } | |
432 | ||
433 | uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code); | |
434 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]); | |
435 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]); | |
436 | _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]); | |
437 | } | |
438 | ||
439 | if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) { | |
440 | fprintf(stderr, "gencase: too many special casing mappings\n"); | |
441 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
442 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
443 | } | |
444 | } | |
445 | ||
446 | static int32_t U_CALLCONV | |
447 | compareSpecialCasings(const void *context, const void *left, const void *right) { | |
448 | return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code; | |
449 | } | |
450 | ||
451 | static void | |
452 | parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { | |
453 | char *fields[5][2]; | |
454 | int32_t i, j; | |
455 | ||
456 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
457 | return; | |
458 | } | |
459 | ||
460 | u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); | |
461 | ||
462 | /* sort the special casing entries by code point */ | |
463 | if(specialCasingCount>0) { | |
464 | uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), | |
465 | compareSpecialCasings, NULL, FALSE, pErrorCode); | |
466 | } | |
467 | if(U_FAILURE(*pErrorCode)) { | |
468 | return; | |
469 | } | |
470 | ||
471 | /* replace multiple entries for any code point by one "complex" one */ | |
472 | j=0; | |
473 | for(i=1; i<specialCasingCount; ++i) { | |
474 | if(specialCasings[i-1].code==specialCasings[i].code) { | |
475 | /* there is a duplicate code point */ | |
476 | specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */ | |
477 | specialCasings[i].isComplex=TRUE; /* make the following one complex */ | |
478 | specialCasings[i].lowerCase[0]=0; | |
479 | specialCasings[i].upperCase[0]=0; | |
480 | specialCasings[i].titleCase[0]=0; | |
481 | ++j; | |
482 | } | |
483 | } | |
484 | ||
485 | /* if some entries just were removed, then re-sort */ | |
486 | if(j>0) { | |
487 | uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), | |
488 | compareSpecialCasings, NULL, FALSE, pErrorCode); | |
489 | specialCasingCount-=j; | |
490 | } | |
491 | if(U_FAILURE(*pErrorCode)) { | |
492 | return; | |
493 | } | |
494 | ||
495 | /* | |
496 | * Add one complex mapping to caseSensitive that was filtered out above: | |
497 | * Greek final Sigma has a conditional mapping but not locale-sensitive, | |
498 | * and it is taken when lowercasing just U+03A3 alone. | |
499 | * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA | |
500 | */ | |
501 | uset_add(caseSensitive, 0x3c2); | |
502 | } | |
503 | ||
504 | /* parser for CaseFolding.txt ----------------------------------------------- */ | |
505 | ||
506 | #define MAX_CASE_FOLDING_COUNT 2000 | |
507 | ||
508 | static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; | |
509 | static int32_t caseFoldingCount=0; | |
510 | ||
511 | static void U_CALLCONV | |
512 | caseFoldingLineFn(void *context, | |
513 | char *fields[][2], int32_t fieldCount, | |
514 | UErrorCode *pErrorCode) { | |
515 | char *end; | |
516 | static UChar32 prevCode=0; | |
517 | int32_t count; | |
518 | char status; | |
519 | ||
520 | /* get code point */ | |
521 | caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); | |
522 | end=(char *)u_skipWhitespace(end); | |
523 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
524 | fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); | |
525 | *pErrorCode=U_PARSE_ERROR; | |
526 | exit(U_PARSE_ERROR); | |
527 | } | |
528 | ||
529 | /* get the status of this mapping */ | |
530 | caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); | |
531 | if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { | |
532 | fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); | |
533 | *pErrorCode=U_PARSE_ERROR; | |
534 | exit(U_PARSE_ERROR); | |
535 | } | |
536 | ||
537 | /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ | |
538 | if(status=='L') { | |
539 | return; | |
540 | } | |
541 | ||
542 | /* get the mapping */ | |
543 | count=caseFoldings[caseFoldingCount].full[0]= | |
544 | (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode); | |
545 | if(U_FAILURE(*pErrorCode)) { | |
546 | fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); | |
547 | exit(*pErrorCode); | |
548 | } | |
549 | ||
550 | /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ | |
551 | if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { | |
552 | caseFoldings[caseFoldingCount].simple=0; | |
553 | } | |
554 | ||
555 | /* update the case-sensitive set */ | |
556 | if(status!='T') { | |
557 | uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); | |
558 | _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); | |
559 | } | |
560 | ||
561 | /* check the status */ | |
562 | if(status=='S') { | |
563 | /* check if there was a full mapping for this code point before */ | |
564 | if( caseFoldingCount>0 && | |
565 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && | |
566 | caseFoldings[caseFoldingCount-1].status=='F' | |
567 | ) { | |
568 | /* merge the two entries */ | |
569 | caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; | |
570 | return; | |
571 | } | |
572 | } else if(status=='F') { | |
573 | /* check if there was a simple mapping for this code point before */ | |
574 | if( caseFoldingCount>0 && | |
575 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && | |
576 | caseFoldings[caseFoldingCount-1].status=='S' | |
577 | ) { | |
578 | /* merge the two entries */ | |
579 | uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); | |
580 | return; | |
581 | } | |
582 | } else if(status=='I' || status=='T') { | |
583 | /* check if there was a default mapping for this code point before (remove it) */ | |
584 | while(caseFoldingCount>0 && | |
585 | caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code | |
586 | ) { | |
587 | prevCode=0; | |
588 | --caseFoldingCount; | |
589 | } | |
590 | /* store only a marker for special handling for cases like dotless i */ | |
591 | caseFoldings[caseFoldingCount].simple=0; | |
592 | caseFoldings[caseFoldingCount].full[0]=0; | |
593 | } | |
594 | ||
595 | /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ | |
596 | if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { | |
597 | fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", | |
598 | (unsigned long)caseFoldings[caseFoldingCount].code, | |
599 | (unsigned long)prevCode); | |
600 | *pErrorCode=U_PARSE_ERROR; | |
601 | exit(U_PARSE_ERROR); | |
602 | } | |
603 | prevCode=caseFoldings[caseFoldingCount].code; | |
604 | ||
605 | if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { | |
606 | fprintf(stderr, "gencase: too many case folding mappings\n"); | |
607 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
608 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
609 | } | |
610 | } | |
611 | ||
612 | static void | |
613 | parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { | |
614 | char *fields[3][2]; | |
615 | ||
616 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
617 | return; | |
618 | } | |
619 | ||
620 | u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); | |
621 | } | |
622 | ||
623 | /* parser for UnicodeData.txt ----------------------------------------------- */ | |
624 | ||
625 | /* general categories */ | |
626 | const char *const | |
627 | genCategoryNames[U_CHAR_CATEGORY_COUNT]={ | |
628 | "Cn", | |
629 | "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", | |
630 | "Mc", "Nd", "Nl", "No", | |
631 | "Zs", "Zl", "Zp", | |
632 | "Cc", "Cf", "Co", "Cs", | |
633 | "Pd", "Ps", "Pe", "Pc", "Po", | |
634 | "Sm", "Sc", "Sk", "So", | |
635 | "Pi", "Pf" | |
636 | }; | |
637 | ||
638 | static int32_t specialCasingIndex=0, caseFoldingIndex=0; | |
639 | ||
640 | static void U_CALLCONV | |
641 | unicodeDataLineFn(void *context, | |
642 | char *fields[][2], int32_t fieldCount, | |
643 | UErrorCode *pErrorCode) { | |
644 | Props p; | |
645 | char *end; | |
646 | static UChar32 prevCode=0; | |
647 | UChar32 value; | |
648 | int32_t i; | |
649 | ||
650 | /* reset the properties */ | |
651 | uprv_memset(&p, 0, sizeof(Props)); | |
652 | ||
653 | /* get the character code, field 0 */ | |
654 | p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16); | |
655 | if(end<=fields[0][0] || end!=fields[0][1]) { | |
656 | fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]); | |
657 | *pErrorCode=U_PARSE_ERROR; | |
658 | exit(U_PARSE_ERROR); | |
659 | } | |
660 | ||
661 | /* get general category, field 2 */ | |
662 | i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]); | |
663 | if(i>=0) { | |
664 | p.gc=(uint8_t)i; | |
665 | } else { | |
666 | fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n", | |
667 | fields[2][0], (unsigned long)p.code); | |
668 | *pErrorCode=U_PARSE_ERROR; | |
669 | exit(U_PARSE_ERROR); | |
670 | } | |
671 | ||
672 | /* get canonical combining class, field 3 */ | |
673 | value=(UChar32)uprv_strtoul(fields[3][0], &end, 10); | |
674 | if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { | |
675 | fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]); | |
676 | *pErrorCode=U_PARSE_ERROR; | |
677 | exit(U_PARSE_ERROR); | |
678 | } | |
679 | p.cc=(uint8_t)value; | |
680 | ||
681 | /* get uppercase mapping, field 12 */ | |
682 | value=(UChar32)uprv_strtoul(fields[12][0], &end, 16); | |
683 | if(end!=fields[12][1]) { | |
684 | fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n", | |
685 | (unsigned long)p.code); | |
686 | *pErrorCode=U_PARSE_ERROR; | |
687 | exit(U_PARSE_ERROR); | |
688 | } | |
689 | if(value!=0 && value!=p.code) { | |
690 | p.upperCase=value; | |
691 | uset_add(caseSensitive, p.code); | |
692 | uset_add(caseSensitive, value); | |
693 | } | |
694 | ||
695 | /* get lowercase value, field 13 */ | |
696 | value=(UChar32)uprv_strtoul(fields[13][0], &end, 16); | |
697 | if(end!=fields[13][1]) { | |
698 | fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n", | |
699 | (unsigned long)p.code); | |
700 | *pErrorCode=U_PARSE_ERROR; | |
701 | exit(U_PARSE_ERROR); | |
702 | } | |
703 | if(value!=0 && value!=p.code) { | |
704 | p.lowerCase=value; | |
705 | uset_add(caseSensitive, p.code); | |
706 | uset_add(caseSensitive, value); | |
707 | } | |
708 | ||
709 | /* get titlecase value, field 14 */ | |
710 | value=(UChar32)uprv_strtoul(fields[14][0], &end, 16); | |
711 | if(end!=fields[14][1]) { | |
712 | fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n", | |
713 | (unsigned long)p.code); | |
714 | *pErrorCode=U_PARSE_ERROR; | |
715 | exit(U_PARSE_ERROR); | |
716 | } | |
717 | if(value!=0 && value!=p.code) { | |
718 | p.titleCase=value; | |
719 | uset_add(caseSensitive, p.code); | |
720 | uset_add(caseSensitive, value); | |
721 | } | |
722 | ||
723 | /* set additional properties from previously parsed files */ | |
724 | if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) { | |
725 | p.specialCasing=specialCasings+specialCasingIndex++; | |
726 | } else { | |
727 | p.specialCasing=NULL; | |
728 | } | |
729 | if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) { | |
730 | p.caseFolding=caseFoldings+caseFoldingIndex++; | |
731 | ||
732 | /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */ | |
733 | if( p.caseFolding->status=='C' && | |
734 | p.caseFolding->simple==p.lowerCase | |
735 | ) { | |
736 | p.caseFolding=NULL; | |
737 | } | |
738 | } else { | |
739 | p.caseFolding=NULL; | |
740 | } | |
741 | ||
742 | /* check for non-character code points */ | |
743 | if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { | |
744 | fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n", | |
745 | (unsigned long)p.code); | |
746 | *pErrorCode=U_PARSE_ERROR; | |
747 | exit(U_PARSE_ERROR); | |
748 | } | |
749 | ||
750 | /* check that the code points (p.code) are in ascending order */ | |
751 | if(p.code<=prevCode && p.code>0) { | |
752 | fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", | |
753 | (unsigned long)p.code, (unsigned long)prevCode); | |
754 | *pErrorCode=U_PARSE_ERROR; | |
755 | exit(U_PARSE_ERROR); | |
756 | } | |
757 | ||
758 | /* properties for a single code point */ | |
759 | setProps(&p); | |
760 | ||
761 | prevCode=p.code; | |
762 | } | |
763 | ||
764 | static void | |
765 | parseDB(const char *filename, UErrorCode *pErrorCode) { | |
766 | char *fields[15][2]; | |
767 | UChar32 start, end; | |
768 | int32_t i; | |
769 | ||
770 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
771 | return; | |
772 | } | |
773 | ||
774 | u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); | |
775 | ||
776 | /* are all sub-properties consumed? */ | |
777 | if(specialCasingIndex<specialCasingCount) { | |
778 | fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n"); | |
779 | *pErrorCode=U_PARSE_ERROR; | |
780 | exit(U_PARSE_ERROR); | |
781 | } | |
782 | if(caseFoldingIndex<caseFoldingCount) { | |
783 | fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n"); | |
784 | *pErrorCode=U_PARSE_ERROR; | |
785 | exit(U_PARSE_ERROR); | |
786 | } | |
787 | ||
788 | if(U_FAILURE(*pErrorCode)) { | |
789 | return; | |
790 | } | |
791 | ||
792 | for(i=0; | |
793 | 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode); | |
794 | ++i | |
795 | ) { | |
796 | addCaseSensitive(start, end); | |
797 | } | |
798 | if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) { | |
799 | *pErrorCode=U_ZERO_ERROR; | |
800 | } | |
801 | } | |
802 | ||
803 | /* | |
804 | * Hey, Emacs, please set the following: | |
805 | * | |
806 | * Local Variables: | |
807 | * indent-tabs-mode: nil | |
808 | * End: | |
809 | * | |
810 | */ |