+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
-* Copyright (C) 2002-2012, International Business Machines
+* Copyright (C) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/ucnv.h"
+#include "unicode/ustring.h"
#include "unicode/utf16.h"
#include "charstr.h"
#include <string.h>
#include "putilimp.h"
-UDate startTime = -1.0;
+UDate startTime;
static int elapsedTime() {
return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0);
}
-#if U_PLATFORM_IMPLEMENTS_POSIX && !U_PLATFORM_HAS_WIN32_API
-#include <signal.h>
-#include <unistd.h>
-
-const char *wToolname="gendict";
-const char *wOutname="(some file)";
-
-const int firstSeconds = 5; /* seconds between notices*/
-const int nextSeconds = 15; /* seconds between notices*/
-
-static void alarm_fn(int /*n*/) {
- printf("%s: still writing\t%s (%ds)\t...\n", wToolname, wOutname, elapsedTime());
-
- signal(SIGALRM, &alarm_fn);
- alarm(nextSeconds); // reset the alarm
-}
-
-static void install_watchdog(const char *toolName, const char *outFileName) {
- wToolname=toolName;
- wOutname=outFileName;
-
- if(startTime<0) { // uninitialized
- startTime = uprv_getRawUTCtime();
- }
- signal(SIGALRM, &alarm_fn);
-
- alarm(firstSeconds); // set the alarm
-}
-
-#else
-static void install_watchdog(const char*, const char*) {
- // not implemented
-}
-#endif
-
-
-
-
U_NAMESPACE_USE
static char *progName;
{ "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */
{ "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 7 */
{ "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */
+ UOPTION_QUIET, /* 9 */
};
enum arguments {
ARG_COPYRIGHT,
ARG_UCHARS,
ARG_BYTES,
- ARG_TRANSFORM
+ ARG_TRANSFORM,
+ ARG_QUIET
};
// prints out the standard usage message describing command line arguments,
"\t-V or --version show a version message\n"
"\t-c or --copyright include a copyright notice\n"
"\t-v or --verbose turn on verbose output\n"
+ "\t-q or --quiet do not display warnings and progress\n"
"\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
"\t followed by path, defaults to %s\n"
"\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n"
//
U_MAIN_INIT_ARGS(argc, argv);
progName = argv[0];
- argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
+ argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
if(argc<0) {
// Unrecognized option
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
}
UBool verbose = options[ARG_VERBOSE].doesOccur;
+ UBool quiet = options[ARG_QUIET].doesOccur;
if (argc < 3) {
fprintf(stderr, "input and output file must both be specified.\n");
const char *outFileName = argv[2];
const char *wordFileName = argv[1];
- // set up the watchdog
- install_watchdog(progName, outFileName);
+ startTime = uprv_getRawUTCtime(); // initialize start timer
- if (options[ARG_ICUDATADIR].doesOccur) {
+ if (options[ARG_ICUDATADIR].doesOccur) {
u_setDataDirectory(options[ARG_ICUDATADIR].value);
}
// Read in the dictionary source file
if (verbose) { printf("Opening file %s...\n", wordFileName); }
const char *codepage = "UTF-8";
- UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status);
+ LocalUCHARBUFPointer f(ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status));
if (status.isFailure()) {
fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
exit(status.reset());
UBool hasValues = FALSE;
UBool hasValuelessContents = FALSE;
int lineCount = 0;
+ int wordCount = 0;
+ int minlen = 255;
+ int maxlen = 0;
UBool isOk = TRUE;
- while (readLine(f, fileLine, status)) {
+ while (readLine(f.getAlias(), fileLine, status)) {
lineCount++;
if (fileLine.isEmpty()) continue;
-
+
// Parse word [spaces value].
int32_t keyLen;
for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
}
dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status);
hasValues = TRUE;
+ wordCount++;
+ if (keyLen < minlen) minlen = keyLen;
+ if (keyLen > maxlen) maxlen = keyLen;
} else {
dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
- hasValuelessContents = FALSE;
+ hasValuelessContents = TRUE;
+ wordCount++;
+ if (keyLen < minlen) minlen = keyLen;
+ if (keyLen > maxlen) maxlen = keyLen;
}
if (status.isFailure()) {
exit(status.reset());
}
}
+ if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); }
if (!isOk && status.isSuccess()) {
status.set(U_ILLEGAL_ARGUMENT_ERROR);
fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
}
- if (verbose) { puts("Serializing data..."); }
+ if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); }
int32_t outDataSize;
const void *outData;
UnicodeString usp;
outData = usp.getBuffer();
}
if (status.isFailure()) {
- fprintf(stderr, "gendict: got failure of type %s while serializing\n", status.errorName());
+ fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName());
exit(status.reset());
}
if (verbose) { puts("Opening output file..."); }
exit(U_INTERNAL_PROGRAM_ERROR);
}
- printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime());
+ if (!quiet) { printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); }
#ifdef TEST_GENDICT
if (isBytesTrie) {