]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gennorm/gennorm.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / tools / gennorm / gennorm.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 2001-2003, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: gennorm.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2001may25
14* created by: Markus W. Scherer
15*
16* This program reads the Unicode character database text file,
17* parses it, and extracts the data for normalization.
18* It then preprocesses it and writes a binary file for efficient use
19* in various Unicode text normalization processes.
20*/
21
22#include <stdio.h>
23#include <stdlib.h>
24#include "unicode/utypes.h"
25#include "unicode/uchar.h"
26#include "unicode/putil.h"
27#include "cmemory.h"
28#include "cstring.h"
29#include "unicode/udata.h"
30#include "unewdata.h"
31#include "uoptions.h"
32#include "uparse.h"
33#include "unormimp.h"
34
35U_CDECL_BEGIN
36#include "gennorm.h"
37U_CDECL_END
38
39#ifdef WIN32
40# pragma warning(disable: 4100)
41#endif
42
43UBool beVerbose=FALSE, haveCopyright=TRUE;
44
45/* prototypes --------------------------------------------------------------- */
46
47static void
48parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);
49
50static void
51parseDB(const char *filename, UErrorCode *pErrorCode);
52
53/* -------------------------------------------------------------------------- */
54
55static UOption options[]={
56 UOPTION_HELP_H,
57 UOPTION_HELP_QUESTION_MARK,
58 UOPTION_VERBOSE,
59 UOPTION_COPYRIGHT,
60 UOPTION_DESTDIR,
61 UOPTION_SOURCEDIR,
62 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }
63};
64
65extern int
66main(int argc, char* argv[]) {
67#if !UCONFIG_NO_NORMALIZATION
68 char filename[300];
69#endif
70 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
71 char *basename=NULL;
72 UErrorCode errorCode=U_ZERO_ERROR;
73
74 U_MAIN_INIT_ARGS(argc, argv);
75
76 /* preset then read command line options */
77 options[4].value=u_getDataDirectory();
78 options[5].value="";
79 options[6].value="3.0.0";
80 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
81
82 /* error handling, printing usage message */
83 if(argc<0) {
84 fprintf(stderr,
85 "error in command line argument \"%s\"\n",
86 argv[-argc]);
87 }
88 if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
89 /*
90 * Broken into chucks because the C89 standard says the minimum
91 * required supported string length is 509 bytes.
92 */
93 fprintf(stderr,
94 "Usage: %s [-options] [suffix]\n"
95 "\n"
96 "Read the UnicodeData.txt file and other Unicode properties files and\n"
97 "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
98 "\n",
99 argv[0]);
100 fprintf(stderr,
101 "Options:\n"
102 "\t-h or -? or --help this usage text\n"
103 "\t-v or --verbose verbose output\n"
104 "\t-c or --copyright include a copyright notice\n"
105 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
106 fprintf(stderr,
107 "\t-d or --destdir destination directory, followed by the path\n"
108 "\t-s or --sourcedir source directory, followed by the path\n"
109 "\tsuffix suffix that is to be appended with a '-'\n"
110 "\t to the source file basenames before opening;\n"
111 "\t 'gennorm new' will read UnicodeData-new.txt etc.\n");
112 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
113 }
114
115 /* get the options values */
116 beVerbose=options[2].doesOccur;
117 haveCopyright=options[3].doesOccur;
118 srcDir=options[5].value;
119 destDir=options[4].value;
120
121 if(argc>=2) {
122 suffix=argv[1];
123 } else {
124 suffix=NULL;
125 }
126
127#if UCONFIG_NO_NORMALIZATION
128
129 fprintf(stderr,
130 "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
131 " because UCONFIG_NO_NORMALIZATION is set, \n"
132 "see icu/source/common/unicode/uconfig.h\n");
133 generateData(destDir);
134
135#else
136
137 setUnicodeVersion(options[6].value);
138
139 /* prepare the filename beginning with the source dir */
140 uprv_strcpy(filename, srcDir);
141 basename=filename+uprv_strlen(filename);
142 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
143 *basename++=U_FILE_SEP_CHAR;
144 }
145
146 /* initialize */
147 init();
148
149 /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
150 if(suffix==NULL) {
151 uprv_strcpy(basename, "DerivedNormalizationProps.txt");
152 } else {
153 uprv_strcpy(basename, "DerivedNormalizationProps");
154 basename[30]='-';
155 uprv_strcpy(basename+31, suffix);
156 uprv_strcat(basename+31, ".txt");
157 }
158 parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
159 if(U_FAILURE(errorCode)) {
160 /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
161 if(suffix==NULL) {
162 uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
163 } else {
164 uprv_strcpy(basename, "DerivedNormalizationProperties");
165 basename[30]='-';
166 uprv_strcpy(basename+31, suffix);
167 uprv_strcat(basename+31, ".txt");
168 }
169 parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
170 }
171
172 /* process UnicodeData.txt */
173 if(suffix==NULL) {
174 uprv_strcpy(basename, "UnicodeData.txt");
175 } else {
176 uprv_strcpy(basename, "UnicodeData");
177 basename[11]='-';
178 uprv_strcpy(basename+12, suffix);
179 uprv_strcat(basename+12, ".txt");
180 }
181 parseDB(filename, &errorCode);
182
183 /* process parsed data */
184 if(U_SUCCESS(errorCode)) {
185 processData();
186
187 /* write the properties data file */
188 generateData(destDir);
189
190 cleanUpData();
191 }
192
193#endif
194
195 return errorCode;
196}
197
198#if !UCONFIG_NO_NORMALIZATION
199
200/* parser for DerivedNormalizationProperties.txt ---------------------------- */
201
202static void U_CALLCONV
203derivedNormalizationPropertiesLineFn(void *context,
204 char *fields[][2], int32_t fieldCount,
205 UErrorCode *pErrorCode) {
206 UChar string[32];
207 char *s;
208 uint32_t start, end;
209 int32_t count;
210 uint8_t qcFlags;
211
212 /* get code point range */
213 count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
214 if(U_FAILURE(*pErrorCode)) {
215 fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
216 exit(*pErrorCode);
217 }
218
219 /* ignore hangul - handle explicitly */
220 if(start==0xac00) {
221 return;
222 }
223
224 /* get property - ignore unrecognized ones */
225 s=(char *)u_skipWhitespace(fields[1][0]);
226 if(*s=='N' && s[1]=='F') {
227 /* quick check flag */
228 qcFlags=0x11;
229 s+=2;
230 if(*s=='K') {
231 qcFlags<<=1;
232 ++s;
233 }
234
235 if(*s=='C' && s[1]=='_') {
236 s+=2;
237 } else if(*s=='D' && s[1]=='_') {
238 qcFlags<<=2;
239 s+=2;
240 } else {
241 return;
242 }
243
244 if(0==uprv_memcmp(s, "NO", 2)) {
245 qcFlags&=0xf;
246 } else if(0==uprv_memcmp(s, "MAYBE", 5)) {
247 qcFlags&=0x30;
248 } else {
249 return;
250 }
251
252 /* set this flag for all code points in this range */
253 while(start<=end) {
254 setQCFlags(start++, qcFlags);
255 }
256 } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
257 /* full composition exclusion */
258 while(start<=end) {
259 setCompositionExclusion(start++);
260 }
261 } else if(0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') {
262 /* FC_NFKC_Closure, parse field 2 to get the string */
263 char *t;
264
265 /* start of the field */
266 s=(char *)u_skipWhitespace(s+1);
267
268 /* find the end of the field */
269 for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
270 *t=0;
271
272 string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
273 if(U_FAILURE(*pErrorCode)) {
274 fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
275 exit(*pErrorCode);
276 }
277 while(start<=end) {
278 setFNC(start++, string);
279 }
280 }
281}
282
283static void
284parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
285 char *fields[2][2];
286
287 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
288 return;
289 }
290
291 u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
292 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
293 fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
294 exit(*pErrorCode);
295 }
296}
297
298/* parser for UnicodeData.txt ----------------------------------------------- */
299
300static void U_CALLCONV
301unicodeDataLineFn(void *context,
302 char *fields[][2], int32_t fieldCount,
303 UErrorCode *pErrorCode) {
304 uint32_t decomp[40];
305 Norm norm;
306 const char *s;
307 char *end;
308 uint32_t code, value;
309 int32_t length;
310 UBool isCompat, something=FALSE;
311
312 /* ignore First and Last entries for ranges */
313 if( *fields[1][0]=='<' &&
314 (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
315 (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
316 ) {
317 return;
318 }
319
320 /* reset the properties */
321 uprv_memset(&norm, 0, sizeof(Norm));
322
323 /* get the character code, field 0 */
324 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
325 if(end<=fields[0][0] || end!=fields[0][1]) {
326 fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
327 *pErrorCode=U_PARSE_ERROR;
328 exit(U_PARSE_ERROR);
329 }
330
331 /* get canonical combining class, field 3 */
332 value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
333 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
334 fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
335 *pErrorCode=U_PARSE_ERROR;
336 exit(U_PARSE_ERROR);
337 }
338 if(value>0) {
339 norm.udataCC=(uint8_t)value;
340 something=TRUE;
341 }
342
343 /* get the decomposition, field 5 */
344 if(fields[5][0]<fields[5][1]) {
345 if(*(s=fields[5][0])=='<') {
346 ++s;
347 isCompat=TRUE;
348
349 /* skip and ignore the compatibility type name */
350 do {
351 if(s==fields[5][1]) {
352 /* missing '>' */
353 fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
354 *pErrorCode=U_PARSE_ERROR;
355 exit(U_PARSE_ERROR);
356 }
357 } while(*s++!='>');
358 } else {
359 isCompat=FALSE;
360 }
361
362 /* parse the decomposition string */
363 length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
364 if(U_FAILURE(*pErrorCode)) {
365 fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
366 (long)code, u_errorName(*pErrorCode));
367 exit(*pErrorCode);
368 }
369
370 /* store the string */
371 if(length>0) {
372 something=TRUE;
373 if(isCompat) {
374 norm.lenNFKD=(uint8_t)length;
375 norm.nfkd=decomp;
376 } else {
377 if(length>2) {
378 fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
379 (long)code, (long)length);
380 *pErrorCode=U_PARSE_ERROR;
381 exit(U_PARSE_ERROR);
382 }
383 norm.lenNFD=(uint8_t)length;
384 norm.nfd=decomp;
385 }
386 }
387 }
388
389 /* check for non-character code points */
390 if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
391 fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
392 (long)code);
393 *pErrorCode=U_PARSE_ERROR;
394 exit(U_PARSE_ERROR);
395 }
396
397 if(something) {
398 /* there are normalization values, so store them */
399#if 0
400 if(beVerbose) {
401 printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
402 (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
403 }
404#endif
405 storeNorm(code, &norm);
406 }
407}
408
409static void
410parseDB(const char *filename, UErrorCode *pErrorCode) {
411 char *fields[15][2];
412
413 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
414 return;
415 }
416
417 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
418 if(U_FAILURE(*pErrorCode)) {
419 fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
420 exit(*pErrorCode);
421 }
422}
423
424#endif /* #if !UCONFIG_NO_NORMALIZATION */
425
426/*
427 * Hey, Emacs, please set the following:
428 *
429 * Local Variables:
430 * indent-tabs-mode: nil
431 * End:
432 *
433 */