1 /*************************************************************************
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
6 **************************************************************************
7 **************************************************************************
9 * Copyright (C) 2002-2010, International Business Machines
10 * Corporation and others. All Rights Reserved.
12 ***************************************************************************
16 // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions.
18 // The use of the ICU Regex API all occurs within the main()
19 // function. The rest of the code deals with opening files,
20 // encoding conversions, printing results, etc.
22 // This is not a full-featured grep program. The command line options
23 // have been kept to a minimum to avoid complicating the sample code.
32 #include "unicode/utypes.h"
33 #include "unicode/ustring.h"
34 #include "unicode/regex.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uclean.h"
41 // The following variables contain parameters that may be set from the command line.
43 const char *pattern
= NULL
; // The regular expression, taken from the first non-option command line argument
44 int firstFileNum
; // argv index of the first file name
45 UBool displayFileName
= FALSE
; // Set to TRUE when more than one file is being processed; each printed match is then prefixed with "fileName:"
46 UBool displayLineNum
= FALSE
; // Set to TRUE by the -n / --line-number option; each printed match is then prefixed with its line number
50 // Info regarding the file currently being processed
53 int fileLen
; // Length, in UTF-16 Code Units. Reset to zero at the start of readFile() so a failed read is not processed.
55 UChar
*ucharBuf
= 0; // Buffer, holds converted file. (Simple minded program, always reads
56 // the whole file at once.) Grown with realloc() and reused from file to file.
58 char *charBuf
= 0; // Buffer, for original, unconverted file data. Also grown with realloc() and reused.
62 // Info regarding the line currently being processed
64 int lineStart
; // Index of first char of the current line in the file buffer
65 int lineEnd
; // Index of char following the new line sequence for the current line
69 // Converter, used on output to convert Unicode data back to char *
70 // so that it will display in non-Unicode terminal windows.
72 UConverter
*outConverter
= 0;
75 // Function forward declarations
77 void processOptions(int argc
, const char **argv
);
78 void nextLine(int start
);
81 void readFile(const char *name
);
85 //------------------------------------------------------------------------------------------
89 // Structurally, all use of the ICU Regular Expression API is in main(),
90 // and all of the supporting stuff necessary to make a running program, but
91 // not directly related to regular expressions, is factored out into these other functions.
94 //------------------------------------------------------------------------------------------
95 int main(int argc
, const char** argv
) {
96 UBool matchFound
= FALSE
;
99 // Process the command line options.
101 processOptions(argc
, argv
);
104 // Create a RegexPattern object from the user supplied pattern string.
106 UErrorCode status
= U_ZERO_ERROR
; // All ICU operations report success or failure
107 // in a status variable.
109 UParseError parseErr
; // In the event of a syntax error in the regex pattern,
110 // this struct will contain the position of the
113 RegexPattern
*rePat
= RegexPattern::compile(pattern
, parseErr
, status
);
114 // Note that C++ is doing an automatic conversion
115 // of the (char *) pattern to a temporary
116 // UnicodeString object.
117 if (U_FAILURE(status
)) {
118 fprintf(stderr
, "ugrep: error in pattern: \"%s\" at position %d\n",
119 u_errorName(status
), parseErr
.offset
);
124 // Create a RegexMatcher from the newly created pattern.
127 RegexMatcher
*matcher
= rePat
->matcher(empty
, status
);
128 if (U_FAILURE(status
)) {
129 fprintf(stderr
, "ugrep: error in creating RegexMatcher: \"%s\"\n",
130 u_errorName(status
));
135 // Loop, processing each of the input files.
137 for (int fileNum
=firstFileNum
; fileNum
< argc
; fileNum
++) {
138 readFile(argv
[fileNum
]);
141 // Loop through the lines of a file, trying to match the regex pattern on each.
143 for (nextLine(0); lineStart
<fileLen
; nextLine(lineEnd
)) {
144 UnicodeString
s(FALSE
, ucharBuf
+lineStart
, lineEnd
-lineStart
);
146 if (matcher
->find()) {
160 ucnv_close(outConverter
);
162 u_cleanup(); // shut down ICU, release any cached data it owns.
164 return matchFound
? 0: 1;
169 //------------------------------------------------------------------------------------------
171 // processOptions Run through the command line options, and set
172 // the global variables accordingly.
174 // exit without returning if an error occurred and
175 // ugrep should not proceed further.
177 //------------------------------------------------------------------------------------------
178 void processOptions(int argc
, const char **argv
) {
180 UBool doUsage
= FALSE
;
181 UBool doVersion
= FALSE
;
185 for(optInd
= 1; optInd
< argc
; ++optInd
) {
189 if(strcmp(arg
, "-V") == 0 || strcmp(arg
, "--version") == 0) {
193 else if(strcmp(arg
, "--help") == 0) {
196 else if(strcmp(arg
, "-n") == 0 || strcmp(arg
, "--line-number") == 0) {
197 displayLineNum
= TRUE
;
199 /* POSIX.1 says all arguments after -- are not options */
200 else if(strcmp(arg
, "--") == 0) {
205 /* unrecognized option */
206 else if(strncmp(arg
, "-", strlen("-")) == 0) {
207 printf("ugrep: invalid option -- %s\n", arg
+1);
210 /* done with options */
222 printf("ugrep version 0.01\n");
223 if (optInd
== argc
) {
228 int remainingArgs
= argc
-optInd
; // pattern file ...
229 if (remainingArgs
< 2) {
230 fprintf(stderr
, "ugrep: files or pattern are missing.\n");
235 if (remainingArgs
> 2) {
236 // More than one file to be processed. Display file names with match output.
237 displayFileName
= TRUE
;
240 pattern
= argv
[optInd
];
241 firstFileNum
= optInd
+1;
244 //------------------------------------------------------------------------------------------
248 //------------------------------------------------------------------------------------------
250 printf("ugrep [options] pattern file...\n"
251 " -V or --version display version information\n"
252 " --help display this help and exit\n"
253 " -- stop further option processing\n"
254 "-n, --line-number Prefix each line of output with the line number within its input file.\n"
259 //------------------------------------------------------------------------------------------
261 // readFile Read a file into memory, and convert it to Unicode.
263 // Since this is just a demo program, take the simple minded approach
264 // of always reading the whole file at once. No intelligent buffering is done.
267 //------------------------------------------------------------------------------------------
268 void readFile(const char *name
) {
271 // Initialize global file variables
274 fileLen
= 0; // zero length prevents processing in case of errors.
278 // Open the file and determine its size.
280 FILE *file
= fopen(name
, "rb");
282 fprintf(stderr
, "ugrep: Could not open file \"%s\"\n", fileName
);
285 fseek(file
, 0, SEEK_END
);
286 int rawFileLen
= ftell(file
);
287 fseek(file
, 0, SEEK_SET
);
293 charBuf
= (char *)realloc(charBuf
, rawFileLen
+1); // Need error checking...
294 int t
= static_cast<int>(fread(charBuf
, 1, rawFileLen
, file
));
295 if (t
!= rawFileLen
) {
296 fprintf(stderr
, "Error reading file \"%s\"\n", fileName
);
300 charBuf
[rawFileLen
]=0;
304 // Look for a Unicode Signature (BOM) in the data
306 int32_t signatureLength
;
307 const char * charDataStart
= charBuf
;
308 UErrorCode status
= U_ZERO_ERROR
;
309 const char* encoding
= ucnv_detectUnicodeSignature(
310 charDataStart
, rawFileLen
, &signatureLength
, &status
);
311 if (U_FAILURE(status
)) {
312 fprintf(stderr
, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
313 u_errorName(status
));
317 charDataStart
+= signatureLength
;
318 rawFileLen
-= signatureLength
;
322 // Open a converter to take the file to UTF-16
325 conv
= ucnv_open(encoding
, &status
);
326 if (U_FAILURE(status
)) {
327 fprintf(stderr
, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status
));
332 // Convert the file data to UChar.
333 // Preflight first to determine required buffer size.
335 uint32_t destCap
= ucnv_toUChars(conv
,
341 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
342 fprintf(stderr
, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
346 status
= U_ZERO_ERROR
;
347 ucharBuf
= (UChar
*)realloc(ucharBuf
, (destCap
+1) * sizeof(UChar
));
354 if (U_FAILURE(status
)) {
355 fprintf(stderr
, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
361 // Successful conversion. Set the global size variables so that
362 // the rest of the processing will proceed for this file.
371 //------------------------------------------------------------------------------------------
373 // nextLine Advance the line index variables, starting at the
374 // specified position in the input file buffer, by
375 // scanning forward until the next end-of-line.
377 // Need to take into account all of the possible Unicode
378 // line ending sequences.
380 //------------------------------------------------------------------------------------------
381 void nextLine(int startPos
) {
387 lineStart
= lineEnd
= startPos
;
390 if (lineEnd
>= fileLen
) {
393 UChar c
= ucharBuf
[lineEnd
];
395 if (c
== 0x0a || // Line Feed
396 c
== 0x0c || // Form Feed
397 c
== 0x0d || // Carriage Return
398 c
== 0x85 || // Next Line
399 c
== 0x2028 || // Line Separator
400 c
== 0x2029) // Paragraph separator
406 // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
407 if (lineEnd
< fileLen
&&
408 ucharBuf
[lineEnd
-1] == 0x0d &&
409 ucharBuf
[lineEnd
] == 0x0a)
416 //------------------------------------------------------------------------------------------
418 // printMatch Called when a matching line has been located.
419 // Print out the line from the file with the match, after
420 // converting it back to the default code page.
422 //------------------------------------------------------------------------------------------
425 UErrorCode status
= U_ZERO_ERROR
;
427 // If we haven't already created a converter for output, do it now.
428 if (outConverter
== 0) {
429 outConverter
= ucnv_open(NULL
, &status
);
430 if (U_FAILURE(status
)) {
431 fprintf(stderr
, "ugrep: Error opening default converter: \"%s\"\n",
432 u_errorName(status
));
437 // Convert the line to be printed back to the default 8 bit code page.
438 // If the line is too long for our buffer, just truncate it.
439 ucnv_fromUChars(outConverter
,
440 buf
, // destination buffer for conversion
441 sizeof(buf
), // capacity of destination buffer
442 &ucharBuf
[lineStart
], // Input to conversion
443 lineEnd
-lineStart
, // number of UChars to convert
445 buf
[sizeof(buf
)-1] = 0; // Add null for use in case of too long lines.
446 // The converter null-terminates its output unless
447 // the buffer completely fills.
449 if (displayFileName
) {
450 printf("%s:", fileName
);
452 if (displayLineNum
) {
453 printf("%d:", lineNum
);