]>
git.saurik.com Git - apple/icu.git/blob - icuSources/samples/ugrep/ugrep.cpp
   1 /************************************************************************* 
   3 *   © 2016 and later: Unicode, Inc. and others. 
   4 *   License & terms of use: http://www.unicode.org/copyright.html#License 
   6 ************************************************************************** 
   7 ************************************************************************** 
   9 *   Copyright (C) 2002-2010, International Business Machines 
  10 *   Corporation and others.  All Rights Reserved. 
  12 *************************************************************************** 
  16 //   ugrep  - an ICU sample program illustrating the use of ICU Regular Expressions. 
  18 //            The use of the ICU Regex API all occurs within the main() 
  19 //            function.  The rest of the code deals with opening files, 
  20 //            encoding conversions, printing results, etc. 
  22 //            This is not a full-featured grep program.  The command line options 
  23 //            have been kept to a minimum to avoid complicating the sample code. 
  32 #include "unicode/utypes.h" 
  33 #include "unicode/ustring.h" 
  34 #include "unicode/regex.h" 
  35 #include "unicode/ucnv.h" 
  36 #include "unicode/uclean.h" 
  41 //  The following variables contain parameters that may be set from the command line. 
  43 const char *pattern 
= NULL
;     // The regular expression 
  44 int        firstFileNum
;        //  argv index of the first file name 
  45 UBool      displayFileName 
= FALSE
;     // Set to TRUE by processOptions() when more than one file is named; each printed match is then prefixed with "fileName:" 
  46 UBool      displayLineNum  
= FALSE
;     // Set to TRUE by -n / --line-number; each printed match is then prefixed with "lineNum:" 
  50 //  Info regarding the file currently being processed 
  53 int         fileLen
;              // Length, in UTF-16 Code Units.  readFile() resets it to 0 first, so a failed read leaves nothing to process. 
  55 UChar      
*ucharBuf 
= 0;         // Buffer, holds converted file.  (Simple minded program, always reads 
  56                                   //   the whole file at once.)  Resized with realloc() in readFile(). 
  58 char       *charBuf 
= 0;          // Buffer, for original, unconverted file data.  Resized with realloc() in readFile(). 
  62 //  Info regarding the line currently being processed 
  64 int      lineStart
;     // Index of first char of the current line in the file buffer.  nextLine(startPos) sets it to startPos. 
  65 int      lineEnd
;       // Index of char following the new line sequence for the current line.  Also starts at startPos in nextLine(). 
  69 //  Converter, used on output to convert Unicode data back to char * 
  70 //             so that it will display in non-Unicode terminal windows. 
  72 UConverter  
*outConverter 
= 0;   // Stays 0 until output first needs it, then opened with ucnv_open(NULL, ...); main() closes it with ucnv_close(). 
  75 //  Function forward declarations 
  77 void processOptions(int argc
, const char **argv
);   // Run through the command line options and set the global variables accordingly 
  78 void nextLine(int start
);   // Advance lineStart/lineEnd, scanning forward from index start to the next end-of-line 
  81 void readFile(const char *name
);   // Read the named file into memory and convert it to Unicode 
  85 //------------------------------------------------------------------------------------------ 
  89 //           Structurally, all use of the ICU Regular Expression API is in main(), 
  90 //           and all of the supporting stuff necessary to make a running program, but 
  91 //           not directly related to regular expressions, is factored out into these other functions. 
  94 //------------------------------------------------------------------------------------------ 
  95 int main(int argc
, const char** argv
) { 
  96     UBool     matchFound 
= FALSE
; 
  99     //  Process the command line options. 
 101     processOptions(argc
, argv
); 
 104     // Create a RegexPattern object from the user supplied pattern string. 
 106     UErrorCode status 
= U_ZERO_ERROR
;   // All ICU operations report success or failure 
 107                                         //   in a status variable. 
 109     UParseError    parseErr
;            // In the event of a syntax error in the regex pattern, 
 110                                         //   this struct will contain the position of the error. 
 113     RegexPattern  
*rePat 
= RegexPattern::compile(pattern
, parseErr
, status
); 
 114                                         // Note that C++ is doing an automatic conversion 
 115                                         //  of the (char *) pattern to a temporary 
 116                                         //  UnicodeString object. 
 117     if (U_FAILURE(status
)) { 
 118         fprintf(stderr
, "ugrep:  error in pattern: \"%s\" at position %d\n", 
 119             u_errorName(status
), parseErr
.offset
); 
 124     // Create a RegexMatcher from the newly created pattern. 
 127     RegexMatcher 
*matcher 
= rePat
->matcher(empty
, status
); 
 128     if (U_FAILURE(status
)) { 
 129         fprintf(stderr
, "ugrep:  error in creating RegexMatcher: \"%s\"\n", 
 130             u_errorName(status
)); 
 135     // Loop, processing each of the input files. 
 137     for (int fileNum
=firstFileNum
; fileNum 
< argc
; fileNum
++) { 
 138         readFile(argv
[fileNum
]); 
 141         //  Loop through the lines of a file, trying to match the regex pattern on each. 
 143         for (nextLine(0); lineStart
<fileLen
; nextLine(lineEnd
)) { 
 144             UnicodeString 
s(FALSE
, ucharBuf
+lineStart
, lineEnd
-lineStart
); 
 146             if (matcher
->find()) { 
 160     ucnv_close(outConverter
); 
 162     u_cleanup();       // shut down ICU, release any cached data it owns. 
 164     return matchFound
? 0: 1; 
 169 //------------------------------------------------------------------------------------------ 
 171 //   processOptions     Run through the command line options, and set 
 172 //                      the global variables accordingly. 
 174 //                      exit without returning if an error occurred and 
 175 //                      ugrep should not proceed further. 
 177 //------------------------------------------------------------------------------------------ 
 178 void processOptions(int argc
, const char **argv
) { 
 180     UBool          doUsage   
= FALSE
; 
 181     UBool          doVersion 
= FALSE
; 
 185     for(optInd 
= 1; optInd 
< argc
; ++optInd
) { 
 189         if(strcmp(arg
, "-V") == 0 || strcmp(arg
, "--version") == 0) { 
 193         else if(strcmp(arg
, "--help") == 0) { 
 196         else if(strcmp(arg
, "-n") == 0 || strcmp(arg
, "--line-number") == 0) { 
 197             displayLineNum 
= TRUE
; 
 199         /* POSIX.1 says all arguments after -- are not options */ 
 200         else if(strcmp(arg
, "--") == 0) { 
 205         /* unrecognized option */ 
 206         else if(strncmp(arg
, "-", strlen("-")) == 0) { 
 207             printf("ugrep: invalid option -- %s\n", arg
+1); 
 210         /* done with options */ 
 222         printf("ugrep version 0.01\n"); 
 223         if (optInd 
== argc
) { 
 228     int  remainingArgs 
= argc
-optInd
;     // pattern file ... 
 229     if (remainingArgs 
< 2) { 
 230         fprintf(stderr
, "ugrep:  files or pattern are missing.\n"); 
 235     if (remainingArgs 
> 2) { 
 236         // More than one file to be processed.   Display file names with match output. 
 237         displayFileName 
= TRUE
; 
 240     pattern      
= argv
[optInd
]; 
 241     firstFileNum 
= optInd
+1; 
 244 //------------------------------------------------------------------------------------------ 
 248 //------------------------------------------------------------------------------------------ 
 250     printf("ugrep [options] pattern file...\n" 
 251         "     -V or --version     display version information\n" 
 252         "     --help              display this help and exit\n" 
 253         "     --                  stop further option processing\n" 
 254         "-n,  --line-number       Prefix each line of output with the line number within its input file.\n" 
 259 //------------------------------------------------------------------------------------------ 
 261 //    readFile          Read a file into memory, and convert it to Unicode. 
 263 //                      Since this is just a demo program, take the simple minded approach 
 264 //                      of always reading the whole file at once.  No intelligent buffering is done. 
 267 //------------------------------------------------------------------------------------------ 
 268 void readFile(const char *name
) { 
 271     //  Initialize global file variables 
 274     fileLen  
= 0;      // zero length prevents processing in case of errors. 
 278     //  Open the file and determine its size. 
 280     FILE *file 
= fopen(name
, "rb"); 
 282         fprintf(stderr
, "ugrep: Could not open file \"%s\"\n", fileName
); 
 285     fseek(file
, 0, SEEK_END
); 
 286     int rawFileLen 
= ftell(file
); 
 287     fseek(file
, 0, SEEK_SET
); 
 293     charBuf    
= (char *)realloc(charBuf
, rawFileLen
+1);   // Need error checking... 
 294     int t 
= static_cast<int>(fread(charBuf
, 1, rawFileLen
, file
)); 
 295     if (t 
!= rawFileLen
)  { 
 296         fprintf(stderr
, "Error reading file \"%s\"\n", fileName
); 
 300     charBuf
[rawFileLen
]=0; 
 304     // Look for a Unicode Signature (BOM) in the data 
 306     int32_t        signatureLength
; 
 307     const char *   charDataStart 
= charBuf
; 
 308     UErrorCode     status        
= U_ZERO_ERROR
; 
 309     const char*    encoding      
= ucnv_detectUnicodeSignature( 
 310                            charDataStart
, rawFileLen
, &signatureLength
, &status
); 
 311     if (U_FAILURE(status
)) { 
 312         fprintf(stderr
, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n", 
 313             u_errorName(status
)); 
 317         charDataStart  
+= signatureLength
; 
 318         rawFileLen     
-= signatureLength
; 
 322     // Open a converter to take the file to UTF-16 
 325     conv 
= ucnv_open(encoding
, &status
); 
 326     if (U_FAILURE(status
)) { 
 327         fprintf(stderr
, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status
)); 
 332     // Convert the file data to UChar. 
 333     //  Preflight first to determine required buffer size. 
 335     uint32_t destCap 
= ucnv_toUChars(conv
, 
 341     if (status 
!= U_BUFFER_OVERFLOW_ERROR
) { 
 342         fprintf(stderr
, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
)); 
 346     status 
= U_ZERO_ERROR
; 
 347     ucharBuf 
= (UChar 
*)realloc(ucharBuf
, (destCap
+1) * sizeof(UChar
)); 
 354     if (U_FAILURE(status
)) { 
 355         fprintf(stderr
, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
)); 
 361     //  Successful conversion.  Set the global size variables so that 
 362     //     the rest of the processing will proceed for this file. 
 371 //------------------------------------------------------------------------------------------ 
 373 //   nextLine           Advance the line index variables, starting at the 
 374 //                      specified position in the input file buffer, by 
 375 //                      scanning forward until the next end-of-line. 
 377 //                      Need to take into account all of the possible Unicode 
 378 //                      line ending sequences. 
 380 //------------------------------------------------------------------------------------------ 
 381 void nextLine(int  startPos
) { 
 387     lineStart 
= lineEnd 
= startPos
; 
 390         if (lineEnd 
>= fileLen
) { 
 393         UChar c 
= ucharBuf
[lineEnd
]; 
 395         if (c 
== 0x0a   ||       // Line Feed 
 396             c 
== 0x0c   ||       // Form Feed 
 397             c 
== 0x0d   ||       // Carriage Return 
 398             c 
== 0x85   ||       // Next Line 
 399             c 
== 0x2028 ||       // Line Separator 
 400             c 
== 0x2029)         // Paragraph separator 
 406     // Check for CR/LF sequence, and advance over the LF if we're in the middle of one. 
 407     if (lineEnd 
< fileLen           
&& 
 408         ucharBuf
[lineEnd
-1] == 0x0d && 
 409         ucharBuf
[lineEnd
]   == 0x0a)  
 416 //------------------------------------------------------------------------------------------ 
 418 //   printMatch         Called when a matching line has been located. 
 419 //                      Print out the line from the file with the match, after 
 420 //                         converting it back to the default code page. 
 422 //------------------------------------------------------------------------------------------ 
 425     UErrorCode         status       
= U_ZERO_ERROR
; 
 427     // If we haven't already created a converter for output, do it now. 
 428     if (outConverter 
== 0) { 
 429         outConverter 
= ucnv_open(NULL
, &status
); 
 430         if (U_FAILURE(status
)) { 
 431             fprintf(stderr
, "ugrep:  Error opening default converter: \"%s\"\n", 
 432                 u_errorName(status
)); 
 437     // Convert the line to be printed back to the default 8 bit code page. 
 438     //   If the line is too long for our buffer, just truncate it. 
 439     ucnv_fromUChars(outConverter
, 
 440                     buf
,                   // destination buffer for conversion 
 441                     sizeof(buf
),           // capacity of destination buffer 
 442                     &ucharBuf
[lineStart
],   // Input to conversion 
 443                     lineEnd
-lineStart
,     // number of UChars to convert 
 445     buf
[sizeof(buf
)-1] = 0;                // Add null for use in case of too long lines. 
 446                                            // The converter null-terminates its output unless 
 447                                            //   the buffer completely fills. 
 449     if (displayFileName
) { 
 450         printf("%s:", fileName
); 
 452     if (displayLineNum
) { 
 453         printf("%d:", lineNum
);