#include "unicode/utypes.h"
#include "n2builder.h"
+#include <fstream>
#include <stdio.h>
#include <stdlib.h>
+#include <string>
#include <string.h>
#include "unicode/errorcode.h"
#include "unicode/localpointer.h"
UBool beVerbose=FALSE, haveCopyright=TRUE;
-U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
-
#if !UCONFIG_NO_NORMALIZATION
// Forward declaration of the input-file parser, which is defined further down in this file.
// The definition reads the stream line by line; this change switches its first
// parameter from a C stdio FILE * to a std::ifstream reference.
-void parseFile(FILE *f, Normalizer2DataBuilder &builder);
+void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
#endif
/* -------------------------------------------------------------------------- */
OUTPUT_FILENAME,
UNICODE_VERSION,
WRITE_C_SOURCE,
+ WRITE_COMBINED_DATA,
OPT_FAST
};
UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
+ UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
};
if( argc<2 ||
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
) {
- /*
- * Broken into chunks because the C89 standard says the minimum
- * required supported string length is 509 bytes.
- */
fprintf(stderr,
"Usage: %s [-options] infiles+ -o outputfilename\n"
"\n"
"Reads the infiles with normalization data and\n"
- "creates a binary or C source file (outputfilename) with the data.\n"
+ "creates a binary file, or a C source file (--csource), with the data,\n"
+ "or writes a data file with the combined data (--combined).\n"
+ "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
+ "\n"
+ "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
+ "\n"
+ "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
+ "in input-file syntax to the outputfilename.\n"
+ "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
+ "(Useful for computing minimal incremental mapping data files.)\n"
"\n",
- argv[0]);
+ argv[0], argv[0]);
fprintf(stderr,
"Options:\n"
"\t-h or -? or --help this usage text\n"
fprintf(stderr,
"\t-s or --sourcedir source directory, followed by the path\n"
"\t-o or --output output filename\n"
- "\t --csource writes a C source file with initializers\n");
+ "\t --csource writes a C source file with initializers\n"
+ "\t --combined writes a .txt file (input-file syntax) with the\n"
+ "\t combined data from all of the input files\n");
fprintf(stderr,
"\t --fast optimize the data for fast normalization,\n"
"\t which might increase its size (Writes fully decomposed\n"
#else
- LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
+ LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
+ LocalPointer<Normalizer2DataBuilder> b2;
+ LocalPointer<Normalizer2DataBuilder> diff;
+ Normalizer2DataBuilder *builder = b1.getAlias();
errorCode.assertSuccess();
if(options[UNICODE_VERSION].doesOccur) {
pathLength=filename.length();
}
+ bool doMinus = false;
// Walk argv[1..argc-1]. Each argument is treated as an input file name, except for the
// literal word "minus", which switches data collection from the first builder to b2.
for(int i=1; i<argc; ++i) {
// NOTE(review): this progress message is printed before the "minus" check below,
// so the keyword itself is also reported as being "processed".
printf("gennorm2: processing %s\n", argv[i]);
+ if(strcmp(argv[i], "minus") == 0) {
// A second "minus" is rejected; the tool exits with U_ILLEGAL_ARGUMENT_ERROR.
+ if(doMinus) {
+ fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
+ exit(U_ILLEGAL_ARGUMENT_ERROR);
+ }
+ // Data from previous input files has been collected in b1.
+ // Collect data from further input files in b2.
+ b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
// The diff builder is allocated here as well; it is only filled in after the loop,
// by Normalizer2DataBuilder::computeDiff().
+ diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+ errorCode.assertSuccess();
// From here on, parseFile() below feeds b2 instead of b1.
+ builder = b2.getAlias();
// Apply the --unicode and --fast options, when given, to the newly created builder.
+ if(options[UNICODE_VERSION].doesOccur) {
+ builder->setUnicodeVersion(options[UNICODE_VERSION].value);
+ }
+ if(options[OPT_FAST].doesOccur) {
+ builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
+ }
+ doMinus = true;
// "minus" is a keyword, not a file name: skip the file handling below.
+ continue;
+ }
// filename holds a prefix of pathLength characters at this point (presumably the
// source directory path -- confirm against the setup code above the loop).
// Append the argument to form the path to open.
filename.append(argv[i], errorCode);
- LocalStdioFilePointer f(fopen(filename.data(), "r"));
- if(f==NULL) {
+ std::ifstream f(filename.data());
// fail() is set when the stream could not be opened.
+ if(f.fail()) {
fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
exit(U_FILE_ACCESS_ERROR);
}
// NOTE(review): OVERRIDE_PREVIOUS presumably lets data from this file replace data
// from files parsed earlier -- confirm in n2builder.h.
builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
- parseFile(f.getAlias(), *builder);
+ parseFile(f, *builder);
// Cut filename back to the shared prefix so the next argument can be appended.
filename.truncate(pathLength);
}
- if(options[WRITE_C_SOURCE].doesOccur) {
+ if(doMinus) {
+ Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
+ diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
+ } else if(options[WRITE_COMBINED_DATA].doesOccur) {
+ builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
+ } else if(options[WRITE_C_SOURCE].doesOccur) {
builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
} else {
builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
#if !UCONFIG_NO_NORMALIZATION
-void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
+void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
IcuToolErrorCode errorCode("gennorm2/parseFile()");
- char line[300];
+ std::string lineString;
uint32_t startCP, endCP;
- while(NULL!=fgets(line, (int)sizeof(line), f)) {
+ while(std::getline(f, lineString)) {
+ if (lineString.empty()) {
+ continue; // skip empty lines.
+ }
+#if (U_CPLUSPLUS_VERSION >= 11)
+ char *line = &lineString.front();
+#else
+ char *line = &lineString.at(0);
+#endif
char *comment=(char *)strchr(line, '#');
if(comment!=NULL) {
*comment=0;
fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
exit(errorCode.reset());
}
+ if (endCP >= 0xd800 && startCP <= 0xdfff) {
+ fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
+ line);
+ exit(U_ILLEGAL_ARGUMENT_ERROR);
+ }
delimiter=u_skipWhitespace(delimiter);
if(*delimiter==':') {
const char *s=u_skipWhitespace(delimiter+1);