ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / samples / ugrep / ugrep.cpp
1 /*************************************************************************
2 *
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
5 *
6 **************************************************************************
7 **************************************************************************
8 *
9 * Copyright (C) 2002-2010, International Business Machines
10 * Corporation and others. All Rights Reserved.
11 *
12 ***************************************************************************
13 */
14
15 //
16 // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions.
17 //
18 // The use of the ICU Regex API all occurs within the main()
19 // function. The rest of the code deals with opening files,
20 // encoding conversions, printing results, etc.
21 //
22 // This is not a full-featured grep program. The command line options
23 // have been kept to a minimum to avoid complicating the sample code.
24 //
25
26
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "unicode/utypes.h"
33 #include "unicode/ustring.h"
34 #include "unicode/regex.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uclean.h"
37
38 using namespace icu;
39
40 //
41 // The following variables contain parameters that may be set from the command line.
42 //
43 const char *pattern = NULL; // The regular expression, as given on the command line
44 int firstFileNum; // argv index of the first file name
45 UBool displayFileName = FALSE; // Prefix each printed line with the name of its file
46 UBool displayLineNum = FALSE; // Prefix each printed line with its line number
47
48
49 //
50 // Info regarding the file currently being processed
51 //
52 const char *fileName; // Name of the file, as passed to readFile()
53 int fileLen; // Length, in UTF-16 Code Units. Zero if the file could not be loaded.
54
55 UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads
56 // the whole file at once.)
57
58 char *charBuf = 0; // Buffer, for original, unconverted file data.
59
60
61 //
62 // Info regarding the line currently being processed
63 //
64 int lineStart; // Index of first char of the current line in the file buffer
65 int lineEnd; // Index of char following the new line sequence for the current line
66 int lineNum; // Number of the current line within its file; maintained by nextLine()
67
68 //
69 // Converter, used on output to convert Unicode data back to char *
70 // so that it will display in non-Unicode terminal windows.
71 //
72 UConverter *outConverter = 0; // Opened on first use by printMatch(), closed by main()
73
74 //
75 // Function forward declarations
76 //
77 void processOptions(int argc, const char **argv); // Parse argv into the option globals
78 void nextLine(int start); // Set lineStart/lineEnd for the line beginning at 'start'
79 void printMatch(); // Print the current line, converted for output
80 void printUsage(); // Print the command line help text
81 void readFile(const char *name); // Load a file and convert it to UTF-16 in ucharBuf
82
83
84
85 //------------------------------------------------------------------------------------------
86 //
87 // main for ugrep
88 //
89 // Structurally, all use of the ICU Regular Expression API is in main(),
90 // and all of the supporting stuff necessary to make a running program, but
91 // not directly related to regular expressions, is factored out into these other
92 // functions.
93 //
94 //------------------------------------------------------------------------------------------
95 int main(int argc, const char** argv) {
96 UBool matchFound = FALSE;
97
98 //
99 // Process the command line options.
100 //
101 processOptions(argc, argv);
102
103 //
104 // Create a RegexPattern object from the user supplied pattern string.
105 //
106 UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure
107 // in a status variable.
108
109 UParseError parseErr; // In the event of a syntax error in the regex pattern,
110 // this struct will contain the position of the
111 // error.
112
113 RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status);
114 // Note that C++ is doing an automatic conversion
115 // of the (char *) pattern to a temporary
116 // UnicodeString object.
117 if (U_FAILURE(status)) {
118 fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n",
119 u_errorName(status), parseErr.offset);
120 exit(-1);
121 }
122
123 //
124 // Create a RegexMatcher from the newly created pattern.
125 //
126 UnicodeString empty;
127 RegexMatcher *matcher = rePat->matcher(empty, status);
128 if (U_FAILURE(status)) {
129 fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n",
130 u_errorName(status));
131 exit(-1);
132 }
133
134 //
135 // Loop, processing each of the input files.
136 //
137 for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
138 readFile(argv[fileNum]);
139
140 //
141 // Loop through the lines of a file, trying to match the regex pattern on each.
142 //
143 for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
144 UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
145 matcher->reset(s);
146 if (matcher->find()) {
147 matchFound = TRUE;
148 printMatch();
149 }
150 }
151 }
152
153 //
154 // Clean up
155 //
156 delete matcher;
157 delete rePat;
158 free(ucharBuf);
159 free(charBuf);
160 ucnv_close(outConverter);
161
162 u_cleanup(); // shut down ICU, release any cached data it owns.
163
164 return matchFound? 0: 1;
165 }
166
167
168
169 //------------------------------------------------------------------------------------------
170 //
171 // processOptions Run through the command line options, and set
172 // the global variables accordingly.
173 //
174 // exit without returning if an error occurred and
175 // ugrep should not proceed further.
176 //
177 //------------------------------------------------------------------------------------------
178 void processOptions(int argc, const char **argv) {
179 int optInd;
180 UBool doUsage = FALSE;
181 UBool doVersion = FALSE;
182 const char *arg;
183
184
185 for(optInd = 1; optInd < argc; ++optInd) {
186 arg = argv[optInd];
187
188 /* version info */
189 if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
190 doVersion = TRUE;
191 }
192 /* usage info */
193 else if(strcmp(arg, "--help") == 0) {
194 doUsage = TRUE;
195 }
196 else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
197 displayLineNum = TRUE;
198 }
199 /* POSIX.1 says all arguments after -- are not options */
200 else if(strcmp(arg, "--") == 0) {
201 /* skip the -- */
202 ++optInd;
203 break;
204 }
205 /* unrecognized option */
206 else if(strncmp(arg, "-", strlen("-")) == 0) {
207 printf("ugrep: invalid option -- %s\n", arg+1);
208 doUsage = TRUE;
209 }
210 /* done with options */
211 else {
212 break;
213 }
214 }
215
216 if (doUsage) {
217 printUsage();
218 exit(0);
219 }
220
221 if (doVersion) {
222 printf("ugrep version 0.01\n");
223 if (optInd == argc) {
224 exit(0);
225 }
226 }
227
228 int remainingArgs = argc-optInd; // pattern file ...
229 if (remainingArgs < 2) {
230 fprintf(stderr, "ugrep: files or pattern are missing.\n");
231 printUsage();
232 exit(1);
233 }
234
235 if (remainingArgs > 2) {
236 // More than one file to be processed. Display file names with match output.
237 displayFileName = TRUE;
238 }
239
240 pattern = argv[optInd];
241 firstFileNum = optInd+1;
242 }
243
244 //------------------------------------------------------------------------------------------
245 //
246 // printUsage
247 //
248 //------------------------------------------------------------------------------------------
//
//  Print the command line help text to stdout.
//
//  This function only prints.  It deliberately does not terminate the
//  program: the caller decides on the exit status, so that a usage error
//  can be reported with a failure status while --help reports success.
//
void printUsage() {
    printf("ugrep [options] pattern file...\n"
        " -V or --version display version information\n"
        " --help display this help and exit\n"
        " -- stop further option processing\n"
        "-n, --line-number Prefix each line of output with the line number within its input file.\n"
        );
}
258
259 //------------------------------------------------------------------------------------------
260 //
261 // readFile Read a file into memory, and convert it to Unicode.
262 //
263 // Since this is just a demo program, take the simple minded approach
264 // of always reading the whole file at once. No intelligent buffering
265 // is done.
266 //
267 //------------------------------------------------------------------------------------------
268 void readFile(const char *name) {
269
270 //
271 // Initialize global file variables
272 //
273 fileName = name;
274 fileLen = 0; // zero length prevents processing in case of errors.
275
276
277 //
278 // Open the file and determine its size.
279 //
280 FILE *file = fopen(name, "rb");
281 if (file == 0 ) {
282 fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
283 return;
284 }
285 fseek(file, 0, SEEK_END);
286 int rawFileLen = ftell(file);
287 fseek(file, 0, SEEK_SET);
288
289
290 //
291 // Read in the file
292 //
293 charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking...
294 int t = static_cast<int>(fread(charBuf, 1, rawFileLen, file));
295 if (t != rawFileLen) {
296 fprintf(stderr, "Error reading file \"%s\"\n", fileName);
297 fclose(file);
298 return;
299 }
300 charBuf[rawFileLen]=0;
301 fclose(file);
302
303 //
304 // Look for a Unicode Signature (BOM) in the data
305 //
306 int32_t signatureLength;
307 const char * charDataStart = charBuf;
308 UErrorCode status = U_ZERO_ERROR;
309 const char* encoding = ucnv_detectUnicodeSignature(
310 charDataStart, rawFileLen, &signatureLength, &status);
311 if (U_FAILURE(status)) {
312 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
313 u_errorName(status));
314 return;
315 }
316 if(encoding!=NULL ){
317 charDataStart += signatureLength;
318 rawFileLen -= signatureLength;
319 }
320
321 //
322 // Open a converter to take the file to UTF-16
323 //
324 UConverter* conv;
325 conv = ucnv_open(encoding, &status);
326 if (U_FAILURE(status)) {
327 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
328 return;
329 }
330
331 //
332 // Convert the file data to UChar.
333 // Preflight first to determine required buffer size.
334 //
335 uint32_t destCap = ucnv_toUChars(conv,
336 NULL, // dest,
337 0, // destCapacity,
338 charDataStart,
339 rawFileLen,
340 &status);
341 if (status != U_BUFFER_OVERFLOW_ERROR) {
342 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
343 return;
344 };
345
346 status = U_ZERO_ERROR;
347 ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
348 ucnv_toUChars(conv,
349 ucharBuf, // dest,
350 destCap+1,
351 charDataStart,
352 rawFileLen,
353 &status);
354 if (U_FAILURE(status)) {
355 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
356 return;
357 };
358 ucnv_close(conv);
359
360 //
361 // Successful conversion. Set the global size variables so that
362 // the rest of the processing will proceed for this file.
363 //
364 fileLen = destCap;
365 }
366
367
368
369
370
371 //------------------------------------------------------------------------------------------
372 //
373 // nextLine Advance the line index variables, starting at the
374 // specified position in the input file buffer, by
375 // scanning forward until the next end-of-line.
376 //
377 // Need to take into account all of the possible Unicode
378 // line ending sequences.
379 //
380 //------------------------------------------------------------------------------------------
381 void nextLine(int startPos) {
382 if (startPos == 0) {
383 lineNum = 0;
384 } else {
385 lineNum++;
386 }
387 lineStart = lineEnd = startPos;
388
389 for (;;) {
390 if (lineEnd >= fileLen) {
391 return;
392 }
393 UChar c = ucharBuf[lineEnd];
394 lineEnd++;
395 if (c == 0x0a || // Line Feed
396 c == 0x0c || // Form Feed
397 c == 0x0d || // Carriage Return
398 c == 0x85 || // Next Line
399 c == 0x2028 || // Line Separator
400 c == 0x2029) // Paragraph separator
401 {
402 break;
403 }
404 }
405
406 // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
407 if (lineEnd < fileLen &&
408 ucharBuf[lineEnd-1] == 0x0d &&
409 ucharBuf[lineEnd] == 0x0a)
410 {
411 lineEnd++;
412 }
413 }
414
415
416 //------------------------------------------------------------------------------------------
417 //
418 // printMatch Called when a matching line has been located.
419 // Print out the line from the file with the match, after
420 // converting it back to the default code page.
421 //
422 //------------------------------------------------------------------------------------------
423 void printMatch() {
424 char buf[2000];
425 UErrorCode status = U_ZERO_ERROR;
426
427 // If we haven't already created a converter for output, do it now.
428 if (outConverter == 0) {
429 outConverter = ucnv_open(NULL, &status);
430 if (U_FAILURE(status)) {
431 fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n",
432 u_errorName(status));
433 exit(-1);
434 }
435 };
436
437 // Convert the line to be printed back to the default 8 bit code page.
438 // If the line is too long for our buffer, just truncate it.
439 ucnv_fromUChars(outConverter,
440 buf, // destination buffer for conversion
441 sizeof(buf), // capacity of destination buffer
442 &ucharBuf[lineStart], // Input to conversion
443 lineEnd-lineStart, // number of UChars to convert
444 &status);
445 buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines.
446 // The converter null-terminates its output unless
447 // the buffer completely fills.
448
449 if (displayFileName) {
450 printf("%s:", fileName);
451 }
452 if (displayLineNum) {
453 printf("%d:", lineNum);
454 }
455 printf("%s", buf);
456 }
457