]> git.saurik.com Git - apple/icu.git/blob - icuSources/samples/ugrep/ugrep.cpp
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / samples / ugrep / ugrep.cpp
1 /**************************************************************************
2 *
3 * Copyright (C) 2002, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ***************************************************************************
7 */
8
9 //
10 // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions.
11 //
12 // The use of the ICU Regex API all occurs within the main()
13 // function. The rest of the code deals with opening files,
14 // encoding conversions, printing results, etc.
15 //
16 // This is not a full-featured grep program. The command line options
17 // have been kept to a minimum to avoid complicating the sample code.
18 //
19
20
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25
26 #include "unicode/utypes.h"
27 #include "unicode/ustring.h"
28 #include "unicode/regex.h"
29 #include "unicode/ucnv.h"
30 #include "unicode/uclean.h"
31
32
33 //
34 // The following variables contain parameters that may be set from the command line.
35 //
36 const char *pattern = NULL; // The regular expression
37 int firstFileNum; // argv index of the first file name
38 UBool displayFileName = FALSE; // Prefix matches with the file name; set when more than one file is given.
39 UBool displayLineNum = FALSE; // Prefix matches with the line number; set by -n / --line-number.
40
41
42 //
43 // Info regarding the file currently being processed
44 //
45 const char *fileName; // Name of the current file, as passed to readFile().
46 int fileLen; // Length, in UTF-16 Code Units.
47
48 UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads
49 // the whole file at once.)
50
51 char *charBuf = 0; // Buffer, for original, unconverted file data.
52
53
54 //
55 // Info regarding the line currently being processed
56 //
57 int lineStart; // Index of first char of the current line in the file buffer
58 int lineEnd; // Index of char following the new line sequence for the current line
59 int lineNum; // Line counter for the current file, maintained by nextLine().
60
61 //
62 // Converter, used on output to convert Unicode data back to char *
63 // so that it will display in non-Unicode terminal windows.
64 // Created by printMatch() the first time a match is printed.
65 UConverter *outConverter = 0;
66
67 //
68 // Function forward declarations
69 //
70 void processOptions(int argc, const char **argv); // Parse argv into the option globals above.
71 void nextLine(int start); // Set lineStart/lineEnd for the line beginning at index start.
72 void printMatch(); // Print the current line to stdout.
73 void printUsage(); // Print the command line summary.
74 void readFile(const char *name); // Load a file into ucharBuf as UTF-16.
75
76
77
75
76
77
78 //------------------------------------------------------------------------------------------
79 //
80 // main for ugrep
81 //
82 // Structurally, all use of the ICU Regular Expression API is in main(),
83 // and all of the supporting stuff necessary to make a running program, but
84 // not directly related to regular expressions, is factored out into these other
85 // functions.
86 //
87 //------------------------------------------------------------------------------------------
88 int main(int argc, const char** argv) {
89 UBool matchFound = FALSE;
90
91 //
92 // Process the commmand line options.
93 //
94 processOptions(argc, argv);
95
96 //
97 // Create a RegexPattern object from the user supplied pattern string.
98 //
99 UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure
100 // in a status variable.
101
102 UParseError parseErr; // In the event of a syntax error in the regex pattern,
103 // this struct will contain the position of the
104 // error.
105
106 RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status);
107 // Note that C++ is doing an automatic conversion
108 // of the (char *) pattern to a temporary
109 // UnicodeString object.
110 if (U_FAILURE(status)) {
111 fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n",
112 u_errorName(status), parseErr.offset);
113 exit(-1);
114 }
115
116 //
117 // Create a RegexMatcher from the newly created pattern.
118 //
119 UnicodeString empty;
120 RegexMatcher *matcher = rePat->matcher(empty, status);
121 if (U_FAILURE(status)) {
122 fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n",
123 u_errorName(status));
124 exit(-1);
125 }
126
127 //
128 // Loop, processing each of the input files.
129 //
130 for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
131 readFile(argv[fileNum]);
132
133 //
134 // Loop through the lines of a file, trying to match the regex pattern on each.
135 //
136 for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
137 UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
138 matcher->reset(s);
139 if (matcher->find()) {
140 matchFound = TRUE;
141 printMatch();
142 }
143 }
144 }
145
146 //
147 // Clean up
148 //
149 delete matcher;
150 delete rePat;
151 free(ucharBuf);
152 free(charBuf);
153 ucnv_close(outConverter);
154
155 u_cleanup(); // shut down ICU, release any cached data it owns.
156
157 return matchFound? 0: 1;
158 }
159
160
161
162 //------------------------------------------------------------------------------------------
163 //
164 // doOptions Run through the command line options, and set
165 // the global variables accordingly.
166 //
167 // exit without returning if an error occured and
168 // ugrep should not proceed further.
169 //
170 //------------------------------------------------------------------------------------------
171 void processOptions(int argc, const char **argv) {
172 int optInd;
173 UBool doUsage = FALSE;
174 UBool doVersion = FALSE;
175 const char *arg;
176
177
178 for(optInd = 1; optInd < argc; ++optInd) {
179 arg = argv[optInd];
180
181 /* version info */
182 if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
183 doVersion = TRUE;
184 }
185 /* usage info */
186 else if(strcmp(arg, "--help") == 0) {
187 doUsage = TRUE;
188 }
189 else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
190 displayLineNum = TRUE;
191 }
192 /* POSIX.1 says all arguments after -- are not options */
193 else if(strcmp(arg, "--") == 0) {
194 /* skip the -- */
195 ++optInd;
196 break;
197 }
198 /* unrecognized option */
199 else if(strncmp(arg, "-", strlen("-")) == 0) {
200 printf("ugrep: invalid option -- %s\n", arg+1);
201 doUsage = TRUE;
202 }
203 /* done with options */
204 else {
205 break;
206 }
207 }
208
209 if (doUsage) {
210 printUsage();
211 exit(0);
212 }
213
214 if (doVersion) {
215 printf("ugrep version 0.01\n");
216 if (optInd == argc) {
217 exit(0);
218 }
219 }
220
221 int remainingArgs = argc-optInd; // pattern file ...
222 if (remainingArgs < 2) {
223 fprintf(stderr, "ugrep: files or pattern are missing.\n");
224 printUsage();
225 exit(1);
226 }
227
228 if (remainingArgs > 2) {
229 // More than one file to be processed. Display file names with match output.
230 displayFileName = TRUE;
231 }
232
233 pattern = argv[optInd];
234 firstFileNum = optInd+1;
235 }
236
//------------------------------------------------------------------------------------------
//
//   printUsage    Print the command line summary to stdout.
//
//                 Returns to the caller, which decides whether and with what
//                 status the program exits.  (Exiting from here with status 0
//                 would mask the failure status of a usage error.)
//
//------------------------------------------------------------------------------------------
void printUsage() {
    printf("ugrep [options] pattern file...\n"
        " -V or --version display version information\n"
        " --help display this help and exit\n"
        " -- stop further option processing\n"
        "-n, --line-number Prefix each line of output with the line number within its input file.\n"
        );
}
251
252 //------------------------------------------------------------------------------------------
253 //
254 // readFile Read a file into memory, and convert it to Unicode.
255 //
256 // Since this is just a demo program, take the simple minded approach
257 // of always reading the whole file at once. No intelligent buffering
258 // is done.
259 //
260 //------------------------------------------------------------------------------------------
261 void readFile(const char *name) {
262
263 //
264 // Initialize global file variables
265 //
266 fileName = name;
267 fileLen = 0; // zero length prevents processing in case of errors.
268
269
270 //
271 // Open the file and determine its size.
272 //
273 FILE *file = fopen(name, "rb");
274 if (file == 0 ) {
275 fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
276 return;
277 }
278 fseek(file, 0, SEEK_END);
279 int rawFileLen = ftell(file);
280 fseek(file, 0, SEEK_SET);
281
282
283 //
284 // Read in the file
285 //
286 charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking...
287 int t = fread(charBuf, 1, rawFileLen, file);
288 if (t != rawFileLen) {
289 fprintf(stderr, "Error reading file \"%s\"\n", fileName);
290 return;
291 }
292 charBuf[rawFileLen]=0;
293 fclose(file);
294
295 //
296 // Look for a Unicode Signature (BOM) in the data
297 //
298 int32_t signatureLength;
299 const char * charDataStart = charBuf;
300 UErrorCode status = U_ZERO_ERROR;
301 const char* encoding = ucnv_detectUnicodeSignature(
302 charDataStart, rawFileLen, &signatureLength, &status);
303 if (U_FAILURE(status)) {
304 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
305 u_errorName(status));
306 return;
307 }
308 if(encoding!=NULL ){
309 charDataStart += signatureLength;
310 rawFileLen -= signatureLength;
311 }
312
313 //
314 // Open a converter to take the file to UTF-16
315 //
316 UConverter* conv;
317 conv = ucnv_open(encoding, &status);
318 if (U_FAILURE(status)) {
319 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
320 return;
321 }
322
323 //
324 // Convert the file data to UChar.
325 // Preflight first to determine required buffer size.
326 //
327 uint32_t destCap = ucnv_toUChars(conv,
328 NULL, // dest,
329 0, // destCapacity,
330 charDataStart,
331 rawFileLen,
332 &status);
333 if (status != U_BUFFER_OVERFLOW_ERROR) {
334 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
335 return;
336 };
337
338 status = U_ZERO_ERROR;
339 ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
340 ucnv_toUChars(conv,
341 ucharBuf, // dest,
342 destCap+1,
343 charDataStart,
344 rawFileLen,
345 &status);
346 if (U_FAILURE(status)) {
347 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
348 return;
349 };
350 ucnv_close(conv);
351
352 //
353 // Successful conversion. Set the global size variables so that
354 // the rest of the processing will proceed for this file.
355 //
356 fileLen = destCap;
357 }
358
359
360
361
362
363 //------------------------------------------------------------------------------------------
364 //
365 // nextLine Advance the line index variables, starting at the
366 // specified position in the input file buffer, by
367 // scanning forwrd until the next end-of-line.
368 //
369 // Need to take into account all of the possible Unicode
370 // line ending sequences.
371 //
372 //------------------------------------------------------------------------------------------
373 void nextLine(int startPos) {
374 if (startPos == 0) {
375 lineNum = 0;
376 } else {
377 lineNum++;
378 }
379 lineStart = lineEnd = startPos;
380
381 for (;;) {
382 if (lineEnd >= fileLen) {
383 return;
384 }
385 UChar c = ucharBuf[lineEnd];
386 lineEnd++;
387 if (c == 0x0a || // Line Feed
388 c == 0x0c || // Form Feed
389 c == 0x0d || // Carriage Return
390 c == 0x85 || // Next Line
391 c == 0x2028 || // Line Separator
392 c == 0x2029) // Paragraph separator
393 {
394 break;
395 }
396 }
397
398 // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
399 if (lineEnd < fileLen &&
400 ucharBuf[lineEnd-1] == 0x0d &&
401 ucharBuf[lineEnd] == 0x0a)
402 {
403 lineEnd++;
404 }
405 }
406
407
408 //------------------------------------------------------------------------------------------
409 //
410 // printMatch Called when a matching line has been located.
411 // Print out the line from the file with the match, after
412 // converting it back to the default code page.
413 //
414 //------------------------------------------------------------------------------------------
415 void printMatch() {
416 char buf[2000];
417 UErrorCode status = U_ZERO_ERROR;
418
419 // If we haven't already created a converter for output, do it now.
420 if (outConverter == 0) {
421 outConverter = ucnv_open(NULL, &status);
422 if (U_FAILURE(status)) {
423 fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n",
424 u_errorName(status));
425 exit(-1);
426 }
427 };
428
429 // Convert the line to be printed back to the default 8 bit code page.
430 // If the line is too long for our buffer, just truncate it.
431 ucnv_fromUChars(outConverter,
432 buf, // destination buffer for conversion
433 sizeof(buf), // capacity of destination buffer
434 &ucharBuf[lineStart], // Input to conversion
435 lineEnd-lineStart, // number of UChars to convert
436 &status);
437 buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines.
438 // The converter null-terminates its output unless
439 // the buffer completely fills.
440
441 if (displayFileName) {
442 printf("%s:", fileName);
443 }
444 if (displayLineNum) {
445 printf("%d:", lineNum);
446 }
447 printf("%s", buf);
448 }
449