icuSources/samples/ugrep/ugrep.cpp

   1 /*************************************************************************
   2 *
   3 *   © 2016 and later: Unicode, Inc. and others.
   4 *   License & terms of use: http://www.unicode.org/copyright.html#License
   5 *
   6 **************************************************************************
   7 **************************************************************************
   8 *
   9 *   Copyright (C) 2002-2010, International Business Machines
  10 *   Corporation and others.  All Rights Reserved.
  11 *
  12 ***************************************************************************
  13 */
  14
  15 //
  16 //   ugrep  - an ICU sample program illustrating the use of ICU Regular Expressions.
  17 //
  18 //            The use of the ICU Regex API all occurs within the main()
  19 //            function.  The rest of the code deals with opening files,
  20 //            encoding conversions, printing results, etc.
  21 //
  22 //            This is not a full-featured grep program.  The command line options
  23 //            have been kept to a minimum to avoid complicating the sample code.
  24 //
  25
  26
  27
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31
  32 #include "unicode/utypes.h"
  33 #include "unicode/ustring.h"
  34 #include "unicode/regex.h"
  35 #include "unicode/ucnv.h"
  36 #include "unicode/uclean.h"
  37
  38 using namespace icu;
  39
  40 //
  41 //  The following variables contain parameters that may be set from the command line.
  42 //
  43 const char *pattern = NULL;     // The regular expression
  44 int        firstFileNum;        //  argv index of the first file name
  45 UBool      displayFileName = FALSE;
  46 UBool      displayLineNum  = FALSE;
  47
  48
  49 //
  50 //  Info regarding the file currently being processed
  51 //
  52 const char *fileName;
  53 int         fileLen;              // Length, in UTF-16 Code Units.
  54
  55 UChar      *ucharBuf = 0;         // Buffer, holds converted file.  (Simple minded program, always reads
  56                                   //   the whole file at once.
  57
  58 char       *charBuf = 0;          // Buffer, for original, unconverted file data.
  59
  60
  61 //
  62 //  Info regarding the line currently being processed
  63 //
  64 int      lineStart;     // Index of first char of the current line in the file buffer
  65 int      lineEnd;       // Index of char following the new line sequence for the current line
  66 int      lineNum;
  67
  68 //
  69 //  Converter, used on output to convert Unicode data back to char *
  70 //             so that it will display in non-Unicode terminal windows.
  71 //
  72 UConverter  *outConverter = 0;
  73
  74 //
  75 //  Function forward declarations
  76 //
  77 void processOptions(int argc, const char **argv);
  78 void nextLine(int start);
  79 void printMatch();
  80 void printUsage();
  81 void readFile(const char *name);
  82
  83
  84
  85 //------------------------------------------------------------------------------------------
  86 //
  87 //   main          for ugrep
  88 //
  89 //           Structurally, all use of the ICU Regular Expression API is in main(),
  90 //           and all of the supporting stuff necessary to make a running program, but
  91 //           not directly related to regular expressions, is factored out into these other
  92 //           functions.
  93 //
  94 //------------------------------------------------------------------------------------------
  95 int main(int argc, const char** argv) {
  96     UBool     matchFound = FALSE;
  97
  98     //
  99     //  Process the command line options.
 100     //
 101     processOptions(argc, argv);
 102
 103     //
 104     // Create a RegexPattern object from the user supplied pattern string.
 105     //
 106     UErrorCode status = U_ZERO_ERROR;   // All ICU operations report success or failure
 107                                         //   in a status variable.
 108
 109     UParseError    parseErr;            // In the event of a syntax error in the regex pattern,
 110                                         //   this struct will contain the position of the
 111                                         //   error.
 112
 113     RegexPattern  *rePat = RegexPattern::compile(pattern, parseErr, status);
 114                                         // Note that C++ is doing an automatic conversion
 115                                         //  of the (char *) pattern to a temporary
 116                                         //  UnicodeString object.
 117     if (U_FAILURE(status)) {
 118         fprintf(stderr, "ugrep:  error in pattern: \"%s\" at position %d\n",
 119             u_errorName(status), parseErr.offset);
 120         exit(-1);
 121     }
 122
 123     //
 124     // Create a RegexMatcher from the newly created pattern.
 125     //
 126     UnicodeString empty;
 127     RegexMatcher *matcher = rePat->matcher(empty, status);
 128     if (U_FAILURE(status)) {
 129         fprintf(stderr, "ugrep:  error in creating RegexMatcher: \"%s\"\n",
 130             u_errorName(status));
 131         exit(-1);
 132     }
 133
 134     //
 135     // Loop, processing each of the input files.
 136     //
 137     for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
 138         readFile(argv[fileNum]);
 139
 140         //
 141         //  Loop through the lines of a file, trying to match the regex pattern on each.
 142         //
 143         for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
 144             UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
 145             matcher->reset(s);
 146             if (matcher->find()) {
 147                 matchFound = TRUE;
 148                 printMatch();
 149             }
 150         }
 151     }
 152
 153     //
 154     //  Clean up
 155     //
 156     delete matcher;
 157     delete rePat;
 158     free(ucharBuf);
 159     free(charBuf);
 160     ucnv_close(outConverter);
 161
 162     u_cleanup();       // shut down ICU, release any cached data it owns.
 163
 164     return matchFound? 0: 1;
 165 }
 166
 167
 168
 169 //------------------------------------------------------------------------------------------
 170 //
 171 //   doOptions          Run through the command line options, and set
 172 //                      the global variables accordingly.
 173 //
 174 //                      exit without returning if an error occurred and
 175 //                      ugrep should not proceed further.
 176 //
 177 //------------------------------------------------------------------------------------------
 178 void processOptions(int argc, const char **argv) {
 179     int            optInd;
 180     UBool          doUsage   = FALSE;
 181     UBool          doVersion = FALSE;
 182     const char    *arg;
 183
 184
 185     for(optInd = 1; optInd < argc; ++optInd) {
 186         arg = argv[optInd];
 187
 188         /* version info */
 189         if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
 190             doVersion = TRUE;
 191         }
 192         /* usage info */
 193         else if(strcmp(arg, "--help") == 0) {
 194             doUsage = TRUE;
 195         }
 196         else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
 197             displayLineNum = TRUE;
 198         }
 199         /* POSIX.1 says all arguments after -- are not options */
 200         else if(strcmp(arg, "--") == 0) {
 201             /* skip the -- */
 202             ++optInd;
 203             break;
 204         }
 205         /* unrecognized option */
 206         else if(strncmp(arg, "-", strlen("-")) == 0) {
 207             printf("ugrep: invalid option -- %s\n", arg+1);
 208             doUsage = TRUE;
 209         }
 210         /* done with options */
 211         else {
 212             break;
 213         }
 214     }
 215
 216     if (doUsage) {
 217         printUsage();
 218         exit(0);
 219     }
 220
 221     if (doVersion) {
 222         printf("ugrep version 0.01\n");
 223         if (optInd == argc) {
 224             exit(0);
 225         }
 226     }
 227
 228     int  remainingArgs = argc-optInd;     // pattern file ...
 229     if (remainingArgs < 2) {
 230         fprintf(stderr, "ugrep:  files or pattern are missing.\n");
 231         printUsage();
 232         exit(1);
 233     }
 234
 235     if (remainingArgs > 2) {
 236         // More than one file to be processed.   Display file names with match output.
 237         displayFileName = TRUE;
 238     }
 239
 240     pattern      = argv[optInd];
 241     firstFileNum = optInd+1;
 242 }
 243
 244 //------------------------------------------------------------------------------------------
 245 //
 246 //   printUsage
 247 //
 248 //------------------------------------------------------------------------------------------
 249 void printUsage() {
 250     printf("ugrep [options] pattern file...\n"
 251         "     -V or --version     display version information\n"
 252         "     --help              display this help and exit\n"
 253         "     --                  stop further option processing\n"
 254         "-n,  --line-number       Prefix each line of output with the line number within its input file.\n"
 255         );
 256     exit(0);
 257 }
 258
 259 //------------------------------------------------------------------------------------------
 260 //
 261 //    readFile          Read a file into memory, and convert it to Unicode.
 262 //
 263 //                      Since this is just a demo program, take the simple minded approach
 264 //                      of always reading the whole file at once.  No intelligent buffering
 265 //                      is done.
 266 //
 267 //------------------------------------------------------------------------------------------
 268 void readFile(const char *name) {
 269
 270     //
 271     //  Initialize global file variables
 272     //
 273     fileName = name;
 274     fileLen  = 0;      // zero length prevents processing in case of errors.
 275
 276
 277     //
 278     //  Open the file and determine its size.
 279     //
 280     FILE *file = fopen(name, "rb");
 281     if (file == 0 ) {
 282         fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
 283         return;
 284     }
 285     fseek(file, 0, SEEK_END);
 286     int rawFileLen = ftell(file);
 287     fseek(file, 0, SEEK_SET);
 288
 289
 290     //
 291     //   Read in the file
 292     //
 293     charBuf    = (char *)realloc(charBuf, rawFileLen+1);   // Need error checking...
 294     int t = static_cast<int>(fread(charBuf, 1, rawFileLen, file));
 295     if (t != rawFileLen)  {
 296         fprintf(stderr, "Error reading file \"%s\"\n", fileName);
 297         fclose(file);
 298         return;
 299     }
 300     charBuf[rawFileLen]=0;
 301     fclose(file);
 302
 303     //
 304     // Look for a Unicode Signature (BOM) in the data
 305     //
 306     int32_t        signatureLength;
 307     const char *   charDataStart = charBuf;
 308     UErrorCode     status        = U_ZERO_ERROR;
 309     const char*    encoding      = ucnv_detectUnicodeSignature(
 310                            charDataStart, rawFileLen, &signatureLength, &status);
 311     if (U_FAILURE(status)) {
 312         fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
 313             u_errorName(status));
 314         return;
 315     }
 316     if(encoding!=NULL ){
 317         charDataStart  += signatureLength;
 318         rawFileLen     -= signatureLength;
 319     }
 320
 321     //
 322     // Open a converter to take the file to UTF-16
 323     //
 324     UConverter* conv;
 325     conv = ucnv_open(encoding, &status);
 326     if (U_FAILURE(status)) {
 327         fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
 328         return;
 329     }
 330
 331     //
 332     // Convert the file data to UChar.
 333     //  Preflight first to determine required buffer size.
 334     //
 335     uint32_t destCap = ucnv_toUChars(conv,
 336                        NULL,           //  dest,
 337                        0,              //  destCapacity,
 338                        charDataStart,
 339                        rawFileLen,
 340                        &status);
 341     if (status != U_BUFFER_OVERFLOW_ERROR) {
 342         fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
 343         return;
 344     };
 345
 346     status = U_ZERO_ERROR;
 347     ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
 348     ucnv_toUChars(conv,
 349         ucharBuf,           //  dest,
 350         destCap+1,
 351         charDataStart,
 352         rawFileLen,
 353         &status);
 354     if (U_FAILURE(status)) {
 355         fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
 356         return;
 357     };
 358     ucnv_close(conv);
 359
 360     //
 361     //  Successful conversion.  Set the global size variables so that
 362     //     the rest of the processing will proceed for this file.
 363     //
 364     fileLen = destCap;
 365 }
 366
 367
 368
 369
 370
 371 //------------------------------------------------------------------------------------------
 372 //
 373 //   nextLine           Advance the line index variables, starting at the
 374 //                      specified position in the input file buffer, by
 375 //                      scanning forward until the next end-of-line.
 376 //
 377 //                      Need to take into account all of the possible Unicode
 378 //                      line ending sequences.
 379 //
 380 //------------------------------------------------------------------------------------------
 381 void nextLine(int  startPos) {
 382     if (startPos == 0) {
 383         lineNum = 0;
 384     } else {
 385         lineNum++;
 386     }
 387     lineStart = lineEnd = startPos;
 388
 389     for (;;) {
 390         if (lineEnd >= fileLen) {
 391             return;
 392         }
 393         UChar c = ucharBuf[lineEnd];
 394         lineEnd++;
 395         if (c == 0x0a   ||       // Line Feed
 396             c == 0x0c   ||       // Form Feed
 397             c == 0x0d   ||       // Carriage Return
 398             c == 0x85   ||       // Next Line
 399             c == 0x2028 ||       // Line Separator
 400             c == 0x2029)         // Paragraph separator
 401         {
 402             break;
 403         }
 404     }
 405
 406     // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
 407     if (lineEnd < fileLen           &&
 408         ucharBuf[lineEnd-1] == 0x0d &&
 409         ucharBuf[lineEnd]   == 0x0a)
 410     {
 411         lineEnd++;
 412     }
 413 }
 414
 415
 416 //------------------------------------------------------------------------------------------
 417 //
 418 //   printMatch         Called when a matching line has been located.
 419 //                      Print out the line from the file with the match, after
 420 //                         converting it back to the default code page.
 421 //
 422 //------------------------------------------------------------------------------------------
 423 void printMatch() {
 424     char                buf[2000];
 425     UErrorCode         status       = U_ZERO_ERROR;
 426
 427     // If we haven't already created a converter for output, do it now.
 428     if (outConverter == 0) {
 429         outConverter = ucnv_open(NULL, &status);
 430         if (U_FAILURE(status)) {
 431             fprintf(stderr, "ugrep:  Error opening default converter: \"%s\"\n",
 432                 u_errorName(status));
 433             exit(-1);
 434         }
 435     };
 436
 437     // Convert the line to be printed back to the default 8 bit code page.
 438     //   If the line is too long for our buffer, just truncate it.
 439     ucnv_fromUChars(outConverter,
 440                     buf,                   // destination buffer for conversion
 441                     sizeof(buf),           // capacity of destination buffer
 442                     &ucharBuf[lineStart],   // Input to conversion
 443                     lineEnd-lineStart,     // number of UChars to convert
 444                     &status);
 445     buf[sizeof(buf)-1] = 0;                // Add null for use in case of too long lines.
 446                                            // The converter null-terminates its output unless
 447                                            //   the buffer completely fills.
 448
 449     if (displayFileName) {
 450         printf("%s:", fileName);
 451     }
 452     if (displayLineNum) {
 453         printf("%d:", lineNum);
 454     }
 455     printf("%s", buf);
 456 }
 457