2 ********************************************************************************
4 * Copyright (C) 1998-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ********************************************************************************
11 * tool creating a binary (compressed) representation of the conversion mapping
12 * table (IBM NLTC ucmap format).
14 * 05/04/2000 helena Added fallback mapping into the picture...
15 * 06/29/2000 helena Major rewrite of the callback APIs.
19 #include "unicode/putil.h"
20 #include "unicode/ucnv_err.h"
30 #include "unicode/udata.h"
/*
 * ConvData bundles the working state for building one converter file.
 * Members visible here: cnvData and extData (NewConverter objects, released
 * through their close() callbacks in cleanupConvData()), and the
 * sharedData/staticData pair that initConvData() links together.
 * NOTE(review): later code reads data->ucm, and no closing line of this
 * typedef is visible, so this listing seems to be missing lines -- confirm
 * against the complete file.
 */
40 typedef struct ConvData
{
42 NewConverter
*cnvData
, *extData
;
43 UConverterSharedData sharedData
;
44 UConverterStaticData staticData
;
/*
 * initConvData: reset a ConvData to its starting state.
 * Zero-fills the whole structure, stores the sizes of the shared and static
 * converter structures in their structSize fields, and points
 * sharedData.staticData at the embedded staticData member.
 */
48 initConvData(ConvData
*data
) {
/* start from an all-zero structure so every pointer member is NULL */
49 uprv_memset(data
, 0, sizeof(ConvData
));
50 data
->sharedData
.structSize
=sizeof(UConverterSharedData
);
51 data
->staticData
.structSize
=sizeof(UConverterStaticData
);
/* the shared data refers to the static data stored in the same ConvData */
52 data
->sharedData
.staticData
=&data
->staticData
;
/*
 * cleanupConvData: release the converter builders held by *data.
 * Each of cnvData and extData, when non-NULL, is closed through its own
 * close() callback.
 * NOTE(review): the visible lines neither reset the pointers afterwards nor
 * release data->ucm; the embedded numbering skips here, so confirm against
 * the complete file.
 */
56 cleanupConvData(ConvData
*data
) {
58 if(data
->cnvData
!=NULL
) {
59 data
->cnvData
->close(data
->cnvData
);
62 if(data
->extData
!=NULL
) {
63 data
->extData
->close(data
->extData
);
72 * from ucnvstat.c - static prototypes of data-based converters
/*
 * Table of prototype static data with one slot per supported converter type.
 * readHeader() indexes it with staticData->conversionType and checks the
 * entry for NULL before copying default values from it.
 */
74 extern const UConverterStaticData
* ucnv_converterStaticData
[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES
];
/*
 * Verbose-output flag, off by default.
 * main() sets it from options[5].doesOccur (the -v / --verbose option).
 */
79 UBool VERBOSE
= FALSE
;
/*
 * Forward declaration of createConverter(), defined near the end of the file.
 * It fills *data from the .ucm file named by converterName and reports
 * failures through *pErrorCode.
 */
82 createConverter(ConvData
*data
, const char* converterName
, UErrorCode
*pErrorCode
);
85 * Set up the UNewData and write the converter.
/*
 * Forward declaration of writeConverterData(), defined below.
 * data supplies the static data and the table writers, cnvName and cnvDir
 * are passed on to udata_create(), and *status carries the error code.
 */
88 writeConverterData(ConvData
*data
, const char *cnvName
, const char *cnvDir
, UErrorCode
*status
);
/*
 * Selects whether writeConverterData() passes U_COPYRIGHT_STRING (TRUE) or
 * NULL (FALSE) to udata_create().
 * Starts as TRUE; main() replaces it with options[2].doesOccur (-c).
 */
90 UBool haveCopyright
=TRUE
;
/*
 * Header description handed to udata_create() in writeConverterData().
 * main() overwrites dataVersion with the value from u_getVersion() before
 * any file is written, and prints formatVersion[0] and [1] for --version.
 * NOTE(review): the leading initializer fields are not visible in this
 * listing (the embedded numbering jumps from 92 to 101).
 */
92 static UDataInfo dataInfo
={
101 {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */
102 {6, 2, 0, 0}, /* formatVersion */
103 {0, 0, 0, 0} /* dataVersion (calculated at runtime) */
/*
 * writeConverterData: write one converter to a "cnv" data file.
 *
 * data    - static data plus the optional base (cnvData) and extension
 *           (extData) table writers
 * cnvName - name passed to udata_create() and shown in messages
 * cnvDir  - directory argument passed to udata_create()
 * status  - in/out error code; checked on entry and after udata_create()
 *
 * The function writes the UConverterStaticData block first, then lets each
 * present table write itself, adding up the byte counts they return. The
 * total is reported next to the count returned by udata_finish(); the
 * mismatch message sets *status to U_INTERNAL_PROGRAM_ERROR.
 * NOTE(review): the declarations of size, sz2 and tableType and the
 * comparison guarding the mismatch message are not visible in this listing.
 */
107 writeConverterData(ConvData
*data
, const char *cnvName
, const char *cnvDir
, UErrorCode
*status
)
109 UNewDataMemory
*mem
= NULL
;
114 if(U_FAILURE(*status
))
/* build a bit set that records which tables will be written */
119 tableType
=TABLE_NONE
;
120 if(data
->cnvData
!=NULL
) {
121 tableType
|=TABLE_BASE
;
123 if(data
->extData
!=NULL
) {
124 tableType
|=TABLE_EXT
;
/* the copyright string is included only when haveCopyright is set */
127 mem
= udata_create(cnvDir
, "cnv", cnvName
, &dataInfo
, haveCopyright
? U_COPYRIGHT_STRING
: NULL
, status
);
129 if(U_FAILURE(*status
))
131 fprintf(stderr
, "Couldn't create the udata %s.%s: %s\n",
134 u_errorName(*status
));
140 fprintf(stderr
, "- Opened udata %s.%s\n", cnvName
, "cnv");
144 /* all read only, clean, platform independent data. Mmmm. :) */
145 udata_writeBlock(mem
, &data
->staticData
, sizeof(UConverterStaticData
));
146 size
+= sizeof(UConverterStaticData
); /* Is 4-aligned - by size */
147 /* Now, write the table */
148 if(tableType
&TABLE_BASE
) {
149 size
+= data
->cnvData
->write(data
->cnvData
, &data
->staticData
, mem
, tableType
);
151 if(tableType
&TABLE_EXT
) {
152 size
+= data
->extData
->write(data
->extData
, &data
->staticData
, mem
, tableType
);
/* udata_finish() returns the byte count reported as "wrote" below */
155 sz2
= udata_finish(mem
, status
);
/* NOTE(review): %u is paired with (int) casts here and in the message below */
158 fprintf(stderr
, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2
, (int)size
);
159 *status
=U_INTERNAL_PROGRAM_ERROR
;
163 fprintf(stderr
, "- Wrote %u bytes to the udata.\n", (int)sz2
);
/*
 * Command-line options understood by main(), parsed with u_parseArgs().
 * main() refers to the entries by index, so the order here must stay in
 * step with the options[0]..options[5] uses in main().
 */
167 static UOption options
[]={
168 UOPTION_HELP_H
, /* 0 Numbers for those who*/
169 UOPTION_HELP_QUESTION_MARK
, /* 1 can't count. */
170 UOPTION_COPYRIGHT
, /* 2 */
171 UOPTION_VERSION
, /* 3 */
172 UOPTION_DESTDIR
, /* 4 */
173 UOPTION_VERBOSE
, /* 5 */
/*
 * main: command-line driver that turns .ucm mapping files into .cnv files.
 *
 * Steps visible in this listing:
 *  1. copy the ICU version into dataInfo.dataVersion;
 *  2. preset the destination directory to u_getDataDirectory() and parse the
 *     options;
 *  3. print usage (bad arguments or help) or version information on request;
 *  4. for each remaining argument: derive the output file name and converter
 *     name from the input name, call createConverter(), force the static
 *     data name to match the file name, call writeConverterData(), and
 *     finally cleanupConvData().
 *
 * A failure for one file is reported on stderr and processing continues with
 * the next file (see the "keep going" comments below).
 * NOTE(review): several local declarations (data, i, destdirlen,
 * printFilename, ...) and the final return are not visible in this listing.
 */
176 int main(int argc
, char* argv
[])
179 UErrorCode err
= U_ZERO_ERROR
, localError
;
180 char outFileName
[UCNV_MAX_FULL_FILE_NAME_LENGTH
];
181 const char* destdir
, *arg
;
183 char* dot
= NULL
, *outBasename
;
184 char cnvName
[UCNV_MAX_FULL_FILE_NAME_LENGTH
];
185 char cnvNameWithPkg
[UCNV_MAX_FULL_FILE_NAME_LENGTH
];
186 UVersionInfo icuVersion
;
191 U_MAIN_INIT_ARGS(argc
, argv
);
193 /* Set up the ICU version number */
194 u_getVersion(icuVersion
);
195 uprv_memcpy(&dataInfo
.dataVersion
, &icuVersion
, sizeof(UVersionInfo
));
197 /* preset then read command line options */
198 options
[4].value
=u_getDataDirectory();
199 argc
=u_parseArgs(argc
, argv
, sizeof(options
)/sizeof(options
[0]), options
);
201 /* error handling, printing usage message */
204 "error in command line argument \"%s\"\n",
/* a negative argc or either help option leads to the usage text */
209 if(argc
<0 || options
[0].doesOccur
|| options
[1].doesOccur
) {
211 "usage: %s [-options] files...\n"
212 "\tread .ucm codepage mapping files and write .cnv files\n"
214 "\t-h or -? or --help this usage text\n"
215 "\t-V or --version show a version message\n"
216 "\t-c or --copyright include a copyright notice\n"
217 "\t-d or --destdir destination directory, followed by the path\n"
218 "\t-v or --verbose Turn on verbose output\n",
/* only a parse error yields a failing exit code; plain help returns zero */
220 return argc
<0 ? U_ILLEGAL_ARGUMENT_ERROR
: U_ZERO_ERROR
;
223 if(options
[3].doesOccur
) {
224 fprintf(stderr
,"makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
225 dataInfo
.formatVersion
[0], dataInfo
.formatVersion
[1]);
226 fprintf(stderr
, U_COPYRIGHT_STRING
"\n");
230 /* get the options values */
231 haveCopyright
= options
[2].doesOccur
;
232 destdir
= options
[4].value
;
233 VERBOSE
= options
[5].doesOccur
;
/*
 * With a non-empty destination directory, outFileName starts with that
 * directory (plus a separator if it lacks one) and outBasename points just
 * past it; otherwise outBasename is the start of outFileName.
 * NOTE(review): no length check on destdir is visible before the copy into
 * the fixed-size outFileName buffer.
 */
235 if (destdir
!= NULL
&& *destdir
!= 0) {
236 uprv_strcpy(outFileName
, destdir
);
237 destdirlen
= uprv_strlen(destdir
);
238 outBasename
= outFileName
+ destdirlen
;
239 if (*(outBasename
- 1) != U_FILE_SEP_CHAR
) {
240 *outBasename
++ = U_FILE_SEP_CHAR
;
245 outBasename
= outFileName
;
251 printf("makeconv: processing %d files...\n", argc
- 1);
252 for(i
=1; i
<argc
; ++i
) {
253 printf("%s ", argv
[i
]);
/* true when more than one input file was given or verbose mode is on */
261 printFilename
= (UBool
) (argc
> 2 || VERBOSE
);
/* walk the remaining arguments; argv[0] is skipped by the initial ++argv */
262 for (++argv
; --argc
; ++argv
)
264 arg
= getLongPathname(*argv
);
266 /*produces the right destination path for display*/
269 const char *basename
;
271 /* find the last file separator */
272 basename
= findBasename(arg
);
273 uprv_strcpy(outBasename
, basename
);
277 uprv_strcpy(outFileName
, arg
);
280 /*removes the extension if any is found*/
281 dot
= uprv_strrchr(outBasename
, '.');
287 /* the basename without extension is the converter name */
288 uprv_strcpy(cnvName
, outBasename
);
290 /*Adds the target extension*/
291 uprv_strcat(outBasename
, CONVERTER_FILE_EXTENSION
);
294 printf("makeconv: processing %s ...\n", arg
);
/* each file gets a fresh error code so one failure does not block the rest */
297 localError
= U_ZERO_ERROR
;
299 createConverter(&data
, arg
, &localError
);
301 if (U_FAILURE(localError
))
303 /* if an error is found, print out an error msg and keep going */
304 fprintf(stderr
, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", outFileName
, arg
,
305 u_errorName(localError
));
312 /* Make the static data name equal to the file name */
/* uprv_stricmp() is non-zero when the names differ, ignoring case */
313 if( /*VERBOSE && */ uprv_stricmp(cnvName
,data
.staticData
.name
))
315 fprintf(stderr
, "Warning: %s%s claims to be '%s'\n",
317 CONVERTER_FILE_EXTENSION
,
318 data
.staticData
.name
);
321 uprv_strcpy((char*)data
.staticData
.name
, cnvName
);
323 if(!uprv_isInvariantString((char*)data
.staticData
.name
, -1)) {
325 "Error: A converter name must contain only invariant characters.\n"
326 "%s is not a valid converter name.\n",
327 data
.staticData
.name
);
329 err
= U_INVALID_TABLE_FORMAT
;
333 uprv_strcpy(cnvNameWithPkg
, cnvName
);
335 localError
= U_ZERO_ERROR
;
336 writeConverterData(&data
, cnvNameWithPkg
, destdir
, &localError
);
338 if(U_FAILURE(localError
))
340 /* if an error is found, print out an error msg and keep going*/
341 fprintf(stderr
, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName
, arg
,
342 u_errorName(localError
));
347 else if (printFilename
)
/* release the table builders created for this input file */
355 cleanupConvData(&data
);
/*
 * getPlatformAndCCSIDFromName: derive platform and CCSID from a code set
 * name.
 *
 * name      - converter/code set name; the first three characters are
 *             compared with "ibm", ignoring case
 * pPlatform - receives the platform; UCNV_UNKNOWN is stored on the visible
 *             fallback path
 * pCCSID    - receives the decimal number parsed from name with
 *             uprv_strtoul()
 *
 * NOTE(review): the lines between the prefix test and the uprv_strtoul()
 * call are not visible here; presumably they advance name past the prefix
 * and set *pPlatform -- confirm against the complete file.
 */
362 getPlatformAndCCSIDFromName(const char *name
, int8_t *pPlatform
, int32_t *pCCSID
) {
363 if( (name
[0]=='i' || name
[0]=='I') &&
364 (name
[1]=='b' || name
[1]=='B') &&
365 (name
[2]=='m' || name
[2]=='M')
372 *pCCSID
=(int32_t)uprv_strtoul(name
, NULL
, 10);
374 *pPlatform
=UCNV_UNKNOWN
;
/*
 * readHeader: read the header section of a .ucm file into data->staticData.
 *
 * data          - receives the parsed values; data->ucm must already exist
 * convFile      - open stream positioned at the start of the file
 * converterName - name of the file being read
 * pErrorCode    - in/out error code; U_INVALID_TABLE_FORMAT is set for an
 *                 illegal <subchar>/<subchar1>, a missing conversion type,
 *                 or a <subchar1> on an unsuitable converter type
 *
 * Lines are read until "CHARMAP". State-related lines are handled by
 * ucm_parseHeaderLine(); of the remaining keys the visible code handles
 * code_set_name, subchar and subchar1. Afterwards the character-length and
 * conversion-type values are copied from the UCMFile states, and for files
 * without a base name any fields still unset are filled from the prototype
 * for the conversion type.
 */
380 readHeader(ConvData
*data
,
381 FileStream
* convFile
,
382 const char* converterName
,
383 UErrorCode
*pErrorCode
) {
385 char *s
, *key
, *value
;
386 const UConverterStaticData
*prototype
;
387 UConverterStaticData
*staticData
;
389 if(U_FAILURE(*pErrorCode
)) {
/* defaults that hold unless the header overrides them */
393 staticData
=&data
->staticData
;
394 staticData
->platform
=UCNV_IBM
;
395 staticData
->subCharLen
=0;
397 while(T_FileStream_readLine(convFile
, line
, sizeof(line
))) {
398 /* basic parsing and handling of state-related items */
399 if(ucm_parseHeaderLine(data
->ucm
, line
, &key
, &value
)) {
403 /* stop at the beginning of the mapping section */
404 if(uprv_strcmp(line
, "CHARMAP")==0) {
408 /* collect the information from the header field, ignore unknown keys */
409 if(uprv_strcmp(key
, "code_set_name")==0) {
/* NOTE(review): no visible length check on value before this copy */
411 uprv_strcpy((char *)staticData
->name
, value
);
412 getPlatformAndCCSIDFromName(value
, &staticData
->platform
, &staticData
->codepage
);
414 } else if(uprv_strcmp(key
, "subchar")==0) {
415 uint8_t bytes
[UCNV_EXT_MAX_BYTES
];
419 length
=ucm_parseBytes(bytes
, line
, (const char **)&s
);
/* accept 1 to 4 bytes, and only when nothing follows the byte sequence */
420 if(1<=length
&& length
<=4 && *s
==0) {
421 staticData
->subCharLen
=length
;
422 uprv_memcpy(staticData
->subChar
, bytes
, length
);
424 fprintf(stderr
, "error: illegal <subchar> %s\n", value
);
425 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
428 } else if(uprv_strcmp(key
, "subchar1")==0) {
429 uint8_t bytes
[UCNV_EXT_MAX_BYTES
];
/* subchar1 must be exactly one byte with nothing after it */
432 if(1==ucm_parseBytes(bytes
, line
, (const char **)&s
) && *s
==0) {
433 staticData
->subChar1
=bytes
[0];
435 fprintf(stderr
, "error: illegal <subchar1> %s\n", value
);
436 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
442 /* copy values from the UCMFile to the static data */
443 staticData
->maxBytesPerChar
=(int8_t)data
->ucm
->states
.maxCharLength
;
444 staticData
->minBytesPerChar
=(int8_t)data
->ucm
->states
.minCharLength
;
445 staticData
->conversionType
=data
->ucm
->states
.conversionType
;
447 if(staticData
->conversionType
==UCNV_UNSUPPORTED_CONVERTER
) {
448 fprintf(stderr
, "ucm error: missing conversion type (<uconv_class>)\n");
449 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
454 * Now that we know the type, copy any 'default' values from the table.
455 * We need not check the type any further because the parser only
456 * recognizes what we have prototypes for.
458 * For delta (extension-only) tables, copy values from the base file
459 * instead, see createConverter().
/* an empty baseName means this file is not a delta table */
461 if(data
->ucm
->baseName
[0]==0) {
462 prototype
=ucnv_converterStaticData
[staticData
->conversionType
];
463 if(prototype
!=NULL
) {
/* each field is filled from the prototype only if it is still unset */
464 if(staticData
->name
[0]==0) {
465 uprv_strcpy((char *)staticData
->name
, prototype
->name
);
468 if(staticData
->codepage
==0) {
469 staticData
->codepage
=prototype
->codepage
;
472 if(staticData
->platform
==0) {
473 staticData
->platform
=prototype
->platform
;
476 if(staticData
->minBytesPerChar
==0) {
477 staticData
->minBytesPerChar
=prototype
->minBytesPerChar
;
480 if(staticData
->maxBytesPerChar
==0) {
481 staticData
->maxBytesPerChar
=prototype
->maxBytesPerChar
;
484 if(staticData
->subCharLen
==0) {
485 staticData
->subCharLen
=prototype
->subCharLen
;
486 if(prototype
->subCharLen
>0) {
487 uprv_memcpy(staticData
->subChar
, prototype
->subChar
, prototype
->subCharLen
);
/* a negative outputType is replaced by maxCharLength-1 */
493 if(data
->ucm
->states
.outputType
<0) {
494 data
->ucm
->states
.outputType
=(int8_t)data
->ucm
->states
.maxCharLength
-1;
/*
 * A subchar1 is rejected when minBytesPerChar exceeds 1 or when the type is
 * neither MBCS nor EBCDIC_STATEFUL.
 */
497 if( staticData
->subChar1
!=0 &&
498 (staticData
->minBytesPerChar
>1 ||
499 (staticData
->conversionType
!=UCNV_MBCS
&&
500 staticData
->conversionType
!=UCNV_EBCDIC_STATEFUL
))
502 fprintf(stderr
, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
503 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
/*
 * readFile: open a .ucm file and load its header and mapping tables into
 * data->ucm.
 *
 * data          - receives a UCMFile from ucm_open() plus the parsed header
 * converterName - path of the .ucm file, opened for reading
 * pErrorCode    - in/out error code; U_FILE_ACCESS_ERROR is set on the
 *                 visible open-failure path, and U_INVALID_TABLE_FORMAT when
 *                 a table mixes entries with and without precision flags
 *
 * After the header, the first table is read with ucm_readTable(); any later
 * "CHARMAP" section is read as an extension table, while empty and comment
 * lines are skipped.
 * NOTE(review): in the visible lines the result of ucm_open() is not
 * checked, and no T_FileStream_close() is visible on the early failure
 * paths -- confirm against the complete file.
 */
507 /* return TRUE if a base table was read, FALSE for an extension table */
509 readFile(ConvData
*data
, const char* converterName
,
510 UErrorCode
*pErrorCode
) {
513 FileStream
*convFile
;
515 UCMStates
*baseStates
;
518 if(U_FAILURE(*pErrorCode
)) {
522 data
->ucm
=ucm_open();
524 convFile
=T_FileStream_open(converterName
, "r");
526 *pErrorCode
=U_FILE_ACCESS_ERROR
;
530 readHeader(data
, convFile
, converterName
, pErrorCode
);
531 if(U_FAILURE(*pErrorCode
)) {
/* an empty baseName marks a base table whose own states are processed */
535 if(data
->ucm
->baseName
[0]==0) {
537 baseStates
=&data
->ucm
->states
;
538 ucm_processStates(baseStates
);
544 /* read the base table */
545 ucm_readTable(data
->ucm
, convFile
, dataIsBase
, baseStates
, pErrorCode
);
546 if(U_FAILURE(*pErrorCode
)) {
550 /* read an extension table if there is one */
551 while(T_FileStream_readLine(convFile
, line
, sizeof(line
))) {
/* end points at the terminating NUL; trailing whitespace is examined below */
552 end
=uprv_strchr(line
, 0);
554 (*(end
-1)=='\n' || *(end
-1)=='\r' || *(end
-1)==' ' || *(end
-1)=='\t')) {
559 if(line
[0]=='#' || u_skipWhitespace(line
)==end
) {
560 continue; /* ignore empty and comment lines */
563 if(0==uprv_strcmp(line
, "CHARMAP")) {
564 /* read the extension table */
565 ucm_readTable(data
->ucm
, convFile
, FALSE
, baseStates
, pErrorCode
);
567 fprintf(stderr
, "unexpected text after the base mapping table\n");
572 T_FileStream_close(convFile
);
/* either table mixing flagged and unflagged entries is a format error */
574 if(data
->ucm
->base
->flagsType
==UCM_FLAGS_MIXED
|| data
->ucm
->ext
->flagsType
==UCM_FLAGS_MIXED
) {
575 fprintf(stderr
, "error: some entries have the mapping precision (with '|'), some do not\n");
576 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
/*
 * createConverter: read a .ucm file and build the in-memory tables for it.
 *
 * data          - ConvData to fill; cnvData and/or extData are created here
 * converterName - path of the .ucm file to read
 * pErrorCode    - in/out error code; U_MEMORY_ALLOCATION_ERROR when a table
 *                 builder cannot be opened, U_INVALID_TABLE_FORMAT when a
 *                 validity check or addTable() fails
 *
 * Two cases are visible:
 *  - readFile() reports a base table: an MBCS builder is opened, the
 *    substitution bytes are validated, an optional extension table is
 *    prepared, and then the base table is added.
 *  - otherwise (delta table): the file named by <icu:base> is read from the
 *    same directory, gaps in the static data are filled from it, fallback
 *    flags are collected, and only the extension table is built.
 * NOTE(review): the line that selects between the two cases and the bodies
 * of the fallback-flag loops are not visible in this listing.
 */
583 createConverter(ConvData
*data
, const char *converterName
, UErrorCode
*pErrorCode
) {
587 UConverterStaticData
*staticData
;
588 UCMStates
*states
, *baseStates
;
590 if(U_FAILURE(*pErrorCode
)) {
596 dataIsBase
=readFile(data
, converterName
, pErrorCode
);
597 if(U_FAILURE(*pErrorCode
)) {
601 staticData
=&data
->staticData
;
602 states
=&data
->ucm
->states
;
605 data
->cnvData
=MBCSOpen(data
->ucm
);
606 if(data
->cnvData
==NULL
) {
607 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
/* the substitution sequence must be valid for this codepage structure */
609 } else if(!data
->cnvData
->isValid(data
->cnvData
,
610 staticData
->subChar
, staticData
->subCharLen
)
612 fprintf(stderr
, " the substitution character byte sequence is illegal in this codepage structure!\n");
613 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
/* subchar1 is validated only when one was specified (non-zero) */
615 } else if(staticData
->subChar1
!=0 &&
616 !data
->cnvData
->isValid(data
->cnvData
, &staticData
->subChar1
, 1)
618 fprintf(stderr
, " the subchar1 byte is illegal in this codepage structure!\n");
619 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
621 } else if(data
->ucm
->ext
->mappingsLength
>0) {
622 /* prepare the extension table, if there is one */
623 data
->extData
=CnvExtOpen(data
->ucm
);
624 if(data
->extData
==NULL
) {
625 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
628 !ucm_checkBaseExt(states
, data
->ucm
->base
, data
->ucm
->ext
, data
->ucm
->ext
, FALSE
) ||
629 !data
->extData
->addTable(data
->extData
, data
->ucm
->ext
, &data
->staticData
)
631 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
635 /* add the base table after ucm_checkBaseExt()! */
636 if( U_SUCCESS(*pErrorCode
) &&
637 !data
->cnvData
->addTable(data
->cnvData
, data
->ucm
->base
, &data
->staticData
)
639 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
/*
 * Delta-table handling starts here.
 * NOTE(review): no visible length check guards the copies into this
 * fixed-size buffer.
 */
642 char baseFilename
[500];
645 initConvData(&baseData
);
647 /* assemble a path/filename for data->ucm->baseName */
648 uprv_strcpy(baseFilename
, converterName
);
/* keep the directory part and replace the file name with <baseName>.ucm */
649 basename
=(char *)findBasename(baseFilename
);
650 uprv_strcpy(basename
, data
->ucm
->baseName
);
651 uprv_strcat(basename
, ".ucm");
653 /* read the base table */
654 dataIsBase
=readFile(&baseData
, baseFilename
, pErrorCode
);
655 if(U_FAILURE(*pErrorCode
)) {
657 } else if(!dataIsBase
) {
658 fprintf(stderr
, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename
);
659 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
661 /* prepare the extension table */
662 data
->extData
=CnvExtOpen(data
->ucm
);
663 if(data
->extData
==NULL
) {
664 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
667 /* fill in gaps in extension file header fields */
668 UCMapping
*m
, *mLimit
;
669 uint8_t fallbackFlags
;
671 baseStates
=&baseData
.ucm
->states
;
/* DBCS forces a minimum of 2; otherwise an unset minimum comes from the base */
672 if(states
->conversionType
==UCNV_DBCS
) {
673 staticData
->minBytesPerChar
=(int8_t)(states
->minCharLength
=2);
674 } else if(states
->minCharLength
==0) {
675 staticData
->minBytesPerChar
=(int8_t)(states
->minCharLength
=baseStates
->minCharLength
);
677 if(states
->maxCharLength
<states
->minCharLength
) {
678 staticData
->maxBytesPerChar
=(int8_t)(states
->maxCharLength
=baseStates
->maxCharLength
);
/* without its own subchar, the delta table takes over the base file's */
681 if(staticData
->subCharLen
==0) {
682 uprv_memcpy(staticData
->subChar
, baseData
.staticData
.subChar
, 4);
683 staticData
->subCharLen
=baseData
.staticData
.subCharLen
;
686 * do not copy subChar1 -
687 * only use what is explicitly specified
688 * because it cannot be unset in the extension file header
691 /* get the fallback flags */
/* both scans stop early once fallbackFlags reaches 3 (both bits set) */
693 for(m
=baseData
.ucm
->base
->mappings
, mLimit
=m
+baseData
.ucm
->base
->mappingsLength
;
694 m
<mLimit
&& fallbackFlags
!=3;
703 for(m
=data
->ucm
->base
->mappings
, mLimit
=m
+data
->ucm
->base
->mappingsLength
;
704 m
<mLimit
&& fallbackFlags
!=3;
/* bit 1 marks from-Unicode fallbacks, bit 2 marks to-Unicode fallbacks */
714 if(fallbackFlags
&1) {
715 staticData
->hasFromUnicodeFallback
=TRUE
;
717 if(fallbackFlags
&2) {
718 staticData
->hasToUnicodeFallback
=TRUE
;
/* the substitution sequence must count as exactly one character */
721 if(1!=ucm_countChars(baseStates
, staticData
->subChar
, staticData
->subCharLen
)) {
722 fprintf(stderr
, " the substitution character byte sequence is illegal in this codepage structure!\n");
723 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
/*
 * NOTE(review): unlike the base-table branch above, this check has no
 * staticData->subChar1!=0 guard, so an unset subchar1 is validated too --
 * confirm whether that is intended.
 */
725 } else if(1!=ucm_countChars(baseStates
, &staticData
->subChar1
, 1)) {
726 fprintf(stderr
, " the subchar1 byte is illegal in this codepage structure!\n");
727 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
730 !ucm_checkValidity(data
->ucm
->ext
, baseStates
) ||
731 !ucm_checkBaseExt(baseStates
, baseData
.ucm
->base
, data
->ucm
->ext
, data
->ucm
->ext
, FALSE
) ||
732 !data
->extData
->addTable(data
->extData
, data
->ucm
->ext
, &data
->staticData
)
734 *pErrorCode
=U_INVALID_TABLE_FORMAT
;
/* the temporary base-file data is released before leaving this branch */
739 cleanupConvData(&baseData
);
744 * Hey, Emacs, please set the following:
747 * indent-tabs-mode: nil