]> git.saurik.com Git - apple/icu.git/blame - icuSources/samples/ugrep/ugrep.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / samples / ugrep / ugrep.cpp
CommitLineData
f3c0d7a5
A
1/*************************************************************************
2*
3* © 2016 and later: Unicode, Inc. and others.
4* License & terms of use: http://www.unicode.org/copyright.html#License
5*
6**************************************************************************
7**************************************************************************
b75a7d8f 8*
729e4ab9 9* Copyright (C) 2002-2010, International Business Machines
b75a7d8f
A
10* Corporation and others. All Rights Reserved.
11*
12***************************************************************************
13*/
14
//
//   ugrep  - an ICU sample program illustrating the use of ICU Regular Expressions.
//
//            All use of the ICU Regex API occurs within the main()
//            function.  The rest of the code deals with opening files,
//            encoding conversions, printing results, etc.
//
//            This is not a full-featured grep program.  The command line options
//            have been kept to a minimum to avoid complicating the sample code.
//
25
26
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include "unicode/utypes.h"
33#include "unicode/ustring.h"
34#include "unicode/regex.h"
35#include "unicode/ucnv.h"
36#include "unicode/uclean.h"
37
0f5d89e8 38using namespace icu;
b75a7d8f
A
39
40//
0f5d89e8 41// The following variables contain parameters that may be set from the command line.
b75a7d8f
A
42//
43const char *pattern = NULL; // The regular expression
44int firstFileNum; // argv index of the first file name
45UBool displayFileName = FALSE;
46UBool displayLineNum = FALSE;
47
48
49//
50// Info regarding the file currently being processed
51//
52const char *fileName;
53int fileLen; // Length, in UTF-16 Code Units.
54
55UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads
56 // the whole file at once.
57
58char *charBuf = 0; // Buffer, for original, unconverted file data.
59
60
61//
62// Info regarding the line currently being processed
63//
64int lineStart; // Index of first char of the current line in the file buffer
65int lineEnd; // Index of char following the new line sequence for the current line
66int lineNum;
67
68//
69// Converter, used on output to convert Unicode data back to char *
70// so that it will display in non-Unicode terminal windows.
71//
72UConverter *outConverter = 0;
73
74//
75// Function forward declarations
76//
77void processOptions(int argc, const char **argv);
78void nextLine(int start);
79void printMatch();
80void printUsage();
81void readFile(const char *name);
82
83
84
85//------------------------------------------------------------------------------------------
86//
87// main for ugrep
88//
89// Structurally, all use of the ICU Regular Expression API is in main(),
90// and all of the supporting stuff necessary to make a running program, but
91// not directly related to regular expressions, is factored out into these other
92// functions.
93//
94//------------------------------------------------------------------------------------------
95int main(int argc, const char** argv) {
96 UBool matchFound = FALSE;
97
98 //
0f5d89e8 99 // Process the command line options.
b75a7d8f
A
100 //
101 processOptions(argc, argv);
102
103 //
104 // Create a RegexPattern object from the user supplied pattern string.
105 //
106 UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure
107 // in a status variable.
108
109 UParseError parseErr; // In the event of a syntax error in the regex pattern,
110 // this struct will contain the position of the
111 // error.
112
113 RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status);
114 // Note that C++ is doing an automatic conversion
115 // of the (char *) pattern to a temporary
116 // UnicodeString object.
117 if (U_FAILURE(status)) {
118 fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n",
119 u_errorName(status), parseErr.offset);
120 exit(-1);
121 }
122
123 //
124 // Create a RegexMatcher from the newly created pattern.
125 //
126 UnicodeString empty;
127 RegexMatcher *matcher = rePat->matcher(empty, status);
128 if (U_FAILURE(status)) {
129 fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n",
130 u_errorName(status));
131 exit(-1);
132 }
133
134 //
135 // Loop, processing each of the input files.
136 //
137 for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
138 readFile(argv[fileNum]);
139
140 //
141 // Loop through the lines of a file, trying to match the regex pattern on each.
142 //
143 for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
144 UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
145 matcher->reset(s);
146 if (matcher->find()) {
147 matchFound = TRUE;
148 printMatch();
149 }
150 }
151 }
152
153 //
154 // Clean up
155 //
156 delete matcher;
157 delete rePat;
158 free(ucharBuf);
159 free(charBuf);
160 ucnv_close(outConverter);
161
162 u_cleanup(); // shut down ICU, release any cached data it owns.
163
164 return matchFound? 0: 1;
165}
166
167
168
169//------------------------------------------------------------------------------------------
170//
171// doOptions Run through the command line options, and set
172// the global variables accordingly.
173//
0f5d89e8 174// exit without returning if an error occurred and
b75a7d8f
A
175// ugrep should not proceed further.
176//
177//------------------------------------------------------------------------------------------
178void processOptions(int argc, const char **argv) {
179 int optInd;
180 UBool doUsage = FALSE;
181 UBool doVersion = FALSE;
182 const char *arg;
183
184
185 for(optInd = 1; optInd < argc; ++optInd) {
186 arg = argv[optInd];
187
188 /* version info */
189 if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
190 doVersion = TRUE;
191 }
192 /* usage info */
193 else if(strcmp(arg, "--help") == 0) {
194 doUsage = TRUE;
195 }
196 else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
197 displayLineNum = TRUE;
198 }
199 /* POSIX.1 says all arguments after -- are not options */
200 else if(strcmp(arg, "--") == 0) {
201 /* skip the -- */
202 ++optInd;
203 break;
204 }
205 /* unrecognized option */
206 else if(strncmp(arg, "-", strlen("-")) == 0) {
207 printf("ugrep: invalid option -- %s\n", arg+1);
208 doUsage = TRUE;
209 }
210 /* done with options */
211 else {
212 break;
213 }
214 }
215
216 if (doUsage) {
217 printUsage();
218 exit(0);
219 }
220
221 if (doVersion) {
222 printf("ugrep version 0.01\n");
223 if (optInd == argc) {
224 exit(0);
225 }
226 }
227
228 int remainingArgs = argc-optInd; // pattern file ...
229 if (remainingArgs < 2) {
230 fprintf(stderr, "ugrep: files or pattern are missing.\n");
231 printUsage();
232 exit(1);
233 }
234
235 if (remainingArgs > 2) {
236 // More than one file to be processed. Display file names with match output.
237 displayFileName = TRUE;
238 }
239
240 pattern = argv[optInd];
241 firstFileNum = optInd+1;
242}
243
244//------------------------------------------------------------------------------------------
245//
246// printUsage
247//
248//------------------------------------------------------------------------------------------
249void printUsage() {
250 printf("ugrep [options] pattern file...\n"
251 " -V or --version display version information\n"
252 " --help display this help and exit\n"
253 " -- stop further option processing\n"
254 "-n, --line-number Prefix each line of output with the line number within its input file.\n"
255 );
256 exit(0);
257}
258
259//------------------------------------------------------------------------------------------
260//
261// readFile Read a file into memory, and convert it to Unicode.
262//
263// Since this is just a demo program, take the simple minded approach
264// of always reading the whole file at once. No intelligent buffering
265// is done.
266//
267//------------------------------------------------------------------------------------------
268void readFile(const char *name) {
269
270 //
271 // Initialize global file variables
272 //
273 fileName = name;
274 fileLen = 0; // zero length prevents processing in case of errors.
275
276
277 //
278 // Open the file and determine its size.
279 //
280 FILE *file = fopen(name, "rb");
281 if (file == 0 ) {
282 fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
283 return;
284 }
285 fseek(file, 0, SEEK_END);
286 int rawFileLen = ftell(file);
287 fseek(file, 0, SEEK_SET);
288
289
290 //
291 // Read in the file
292 //
293 charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking...
0f5d89e8 294 int t = static_cast<int>(fread(charBuf, 1, rawFileLen, file));
b75a7d8f
A
295 if (t != rawFileLen) {
296 fprintf(stderr, "Error reading file \"%s\"\n", fileName);
729e4ab9 297 fclose(file);
b75a7d8f
A
298 return;
299 }
300 charBuf[rawFileLen]=0;
301 fclose(file);
302
303 //
304 // Look for a Unicode Signature (BOM) in the data
305 //
306 int32_t signatureLength;
307 const char * charDataStart = charBuf;
308 UErrorCode status = U_ZERO_ERROR;
309 const char* encoding = ucnv_detectUnicodeSignature(
310 charDataStart, rawFileLen, &signatureLength, &status);
311 if (U_FAILURE(status)) {
312 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
313 u_errorName(status));
314 return;
315 }
316 if(encoding!=NULL ){
317 charDataStart += signatureLength;
318 rawFileLen -= signatureLength;
319 }
320
321 //
322 // Open a converter to take the file to UTF-16
323 //
324 UConverter* conv;
325 conv = ucnv_open(encoding, &status);
326 if (U_FAILURE(status)) {
327 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
328 return;
329 }
330
331 //
332 // Convert the file data to UChar.
333 // Preflight first to determine required buffer size.
334 //
335 uint32_t destCap = ucnv_toUChars(conv,
336 NULL, // dest,
337 0, // destCapacity,
338 charDataStart,
339 rawFileLen,
340 &status);
341 if (status != U_BUFFER_OVERFLOW_ERROR) {
342 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
343 return;
344 };
345
346 status = U_ZERO_ERROR;
347 ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
348 ucnv_toUChars(conv,
349 ucharBuf, // dest,
350 destCap+1,
351 charDataStart,
352 rawFileLen,
353 &status);
354 if (U_FAILURE(status)) {
355 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
356 return;
357 };
358 ucnv_close(conv);
359
360 //
361 // Successful conversion. Set the global size variables so that
362 // the rest of the processing will proceed for this file.
363 //
364 fileLen = destCap;
365}
366
367
368
369
370
371//------------------------------------------------------------------------------------------
372//
373// nextLine Advance the line index variables, starting at the
374// specified position in the input file buffer, by
0f5d89e8 375// scanning forward until the next end-of-line.
b75a7d8f
A
376//
377// Need to take into account all of the possible Unicode
378// line ending sequences.
379//
380//------------------------------------------------------------------------------------------
381void nextLine(int startPos) {
382 if (startPos == 0) {
383 lineNum = 0;
384 } else {
385 lineNum++;
386 }
387 lineStart = lineEnd = startPos;
388
389 for (;;) {
390 if (lineEnd >= fileLen) {
391 return;
392 }
393 UChar c = ucharBuf[lineEnd];
394 lineEnd++;
395 if (c == 0x0a || // Line Feed
396 c == 0x0c || // Form Feed
397 c == 0x0d || // Carriage Return
398 c == 0x85 || // Next Line
399 c == 0x2028 || // Line Separator
400 c == 0x2029) // Paragraph separator
401 {
402 break;
403 }
404 }
405
406 // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
407 if (lineEnd < fileLen &&
408 ucharBuf[lineEnd-1] == 0x0d &&
409 ucharBuf[lineEnd] == 0x0a)
410 {
411 lineEnd++;
412 }
413}
414
415
416//------------------------------------------------------------------------------------------
417//
418// printMatch Called when a matching line has been located.
419// Print out the line from the file with the match, after
420// converting it back to the default code page.
421//
422//------------------------------------------------------------------------------------------
423void printMatch() {
424 char buf[2000];
425 UErrorCode status = U_ZERO_ERROR;
426
427 // If we haven't already created a converter for output, do it now.
428 if (outConverter == 0) {
429 outConverter = ucnv_open(NULL, &status);
430 if (U_FAILURE(status)) {
431 fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n",
432 u_errorName(status));
433 exit(-1);
434 }
435 };
436
437 // Convert the line to be printed back to the default 8 bit code page.
438 // If the line is too long for our buffer, just truncate it.
439 ucnv_fromUChars(outConverter,
440 buf, // destination buffer for conversion
441 sizeof(buf), // capacity of destination buffer
442 &ucharBuf[lineStart], // Input to conversion
443 lineEnd-lineStart, // number of UChars to convert
444 &status);
445 buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines.
446 // The converter null-terminates its output unless
447 // the buffer completely fills.
448
449 if (displayFileName) {
450 printf("%s:", fileName);
451 }
452 if (displayLineNum) {
453 printf("%d:", lineNum);
454 }
455 printf("%s", buf);
456}
457