]> git.saurik.com Git - apple/icu.git/blob - icuSources/samples/ugrep/ugrep.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / samples / ugrep / ugrep.cpp
1 /**************************************************************************
2 *
3 * Copyright (C) 2002-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ***************************************************************************
7 */
8
9 //
10 // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions.
11 //
12 // The use of the ICU Regex API all occurs within the main()
13 // function. The rest of the code deals with opening files,
14 // encoding conversions, printing results, etc.
15 //
16 // This is not a full-featured grep program. The command line options
17 // have been kept to a minimum to avoid complicating the sample code.
18 //
19
20
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25
26 #include "unicode/utypes.h"
27 #include "unicode/ustring.h"
28 #include "unicode/regex.h"
29 #include "unicode/ucnv.h"
30 #include "unicode/uclean.h"
31
32
33 //
34 // The following variables contain paramters that may be set from the command line.
35 //
36 const char *pattern = NULL; // The regular expression
37 int firstFileNum; // argv index of the first file name
38 UBool displayFileName = FALSE;
39 UBool displayLineNum = FALSE;
40
41
42 //
43 // Info regarding the file currently being processed
44 //
45 const char *fileName;
46 int fileLen; // Length, in UTF-16 Code Units.
47
48 UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads
49 // the whole file at once.
50
51 char *charBuf = 0; // Buffer, for original, unconverted file data.
52
53
54 //
55 // Info regarding the line currently being processed
56 //
57 int lineStart; // Index of first char of the current line in the file buffer
58 int lineEnd; // Index of char following the new line sequence for the current line
59 int lineNum;
60
61 //
62 // Converter, used on output to convert Unicode data back to char *
63 // so that it will display in non-Unicode terminal windows.
64 //
65 UConverter *outConverter = 0;
66
//
//  Prototypes for the helper functions that are defined after main().
//
void nextLine(int startPos);
void printMatch();
void printUsage();
void processOptions(int argc, const char **argv);
void readFile(const char *name);
75
76
77
78 //------------------------------------------------------------------------------------------
79 //
80 // main for ugrep
81 //
82 // Structurally, all use of the ICU Regular Expression API is in main(),
83 // and all of the supporting stuff necessary to make a running program, but
84 // not directly related to regular expressions, is factored out into these other
85 // functions.
86 //
87 //------------------------------------------------------------------------------------------
88 int main(int argc, const char** argv) {
89 UBool matchFound = FALSE;
90
91 //
92 // Process the commmand line options.
93 //
94 processOptions(argc, argv);
95
96 //
97 // Create a RegexPattern object from the user supplied pattern string.
98 //
99 UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure
100 // in a status variable.
101
102 UParseError parseErr; // In the event of a syntax error in the regex pattern,
103 // this struct will contain the position of the
104 // error.
105
106 RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status);
107 // Note that C++ is doing an automatic conversion
108 // of the (char *) pattern to a temporary
109 // UnicodeString object.
110 if (U_FAILURE(status)) {
111 fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n",
112 u_errorName(status), parseErr.offset);
113 exit(-1);
114 }
115
116 //
117 // Create a RegexMatcher from the newly created pattern.
118 //
119 UnicodeString empty;
120 RegexMatcher *matcher = rePat->matcher(empty, status);
121 if (U_FAILURE(status)) {
122 fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n",
123 u_errorName(status));
124 exit(-1);
125 }
126
127 //
128 // Loop, processing each of the input files.
129 //
130 for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
131 readFile(argv[fileNum]);
132
133 //
134 // Loop through the lines of a file, trying to match the regex pattern on each.
135 //
136 for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
137 UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
138 matcher->reset(s);
139 if (matcher->find()) {
140 matchFound = TRUE;
141 printMatch();
142 }
143 }
144 }
145
146 //
147 // Clean up
148 //
149 delete matcher;
150 delete rePat;
151 free(ucharBuf);
152 free(charBuf);
153 ucnv_close(outConverter);
154
155 u_cleanup(); // shut down ICU, release any cached data it owns.
156
157 return matchFound? 0: 1;
158 }
159
160
161
162 //------------------------------------------------------------------------------------------
163 //
164 // doOptions Run through the command line options, and set
165 // the global variables accordingly.
166 //
167 // exit without returning if an error occured and
168 // ugrep should not proceed further.
169 //
170 //------------------------------------------------------------------------------------------
171 void processOptions(int argc, const char **argv) {
172 int optInd;
173 UBool doUsage = FALSE;
174 UBool doVersion = FALSE;
175 const char *arg;
176
177
178 for(optInd = 1; optInd < argc; ++optInd) {
179 arg = argv[optInd];
180
181 /* version info */
182 if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
183 doVersion = TRUE;
184 }
185 /* usage info */
186 else if(strcmp(arg, "--help") == 0) {
187 doUsage = TRUE;
188 }
189 else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
190 displayLineNum = TRUE;
191 }
192 /* POSIX.1 says all arguments after -- are not options */
193 else if(strcmp(arg, "--") == 0) {
194 /* skip the -- */
195 ++optInd;
196 break;
197 }
198 /* unrecognized option */
199 else if(strncmp(arg, "-", strlen("-")) == 0) {
200 printf("ugrep: invalid option -- %s\n", arg+1);
201 doUsage = TRUE;
202 }
203 /* done with options */
204 else {
205 break;
206 }
207 }
208
209 if (doUsage) {
210 printUsage();
211 exit(0);
212 }
213
214 if (doVersion) {
215 printf("ugrep version 0.01\n");
216 if (optInd == argc) {
217 exit(0);
218 }
219 }
220
221 int remainingArgs = argc-optInd; // pattern file ...
222 if (remainingArgs < 2) {
223 fprintf(stderr, "ugrep: files or pattern are missing.\n");
224 printUsage();
225 exit(1);
226 }
227
228 if (remainingArgs > 2) {
229 // More than one file to be processed. Display file names with match output.
230 displayFileName = TRUE;
231 }
232
233 pattern = argv[optInd];
234 firstFileNum = optInd+1;
235 }
236
//------------------------------------------------------------------------------------------
//
//   printUsage    Write the command line summary to stdout.
//
//                 This function only prints; it does not terminate the program.
//                 The caller decides the exit status, so that a usage message
//                 shown because of a command line error can be followed by a
//                 non-zero exit code.
//
//------------------------------------------------------------------------------------------
void printUsage() {
    printf("ugrep [options] pattern file...\n"
        " -V or --version display version information\n"
        " --help display this help and exit\n"
        " -- stop further option processing\n"
        "-n, --line-number Prefix each line of output with the line number within its input file.\n"
        );
}
251
252 //------------------------------------------------------------------------------------------
253 //
254 // readFile Read a file into memory, and convert it to Unicode.
255 //
256 // Since this is just a demo program, take the simple minded approach
257 // of always reading the whole file at once. No intelligent buffering
258 // is done.
259 //
260 //------------------------------------------------------------------------------------------
261 void readFile(const char *name) {
262
263 //
264 // Initialize global file variables
265 //
266 fileName = name;
267 fileLen = 0; // zero length prevents processing in case of errors.
268
269
270 //
271 // Open the file and determine its size.
272 //
273 FILE *file = fopen(name, "rb");
274 if (file == 0 ) {
275 fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
276 return;
277 }
278 fseek(file, 0, SEEK_END);
279 int rawFileLen = ftell(file);
280 fseek(file, 0, SEEK_SET);
281
282
283 //
284 // Read in the file
285 //
286 charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking...
287 int t = fread(charBuf, 1, rawFileLen, file);
288 if (t != rawFileLen) {
289 fprintf(stderr, "Error reading file \"%s\"\n", fileName);
290 fclose(file);
291 return;
292 }
293 charBuf[rawFileLen]=0;
294 fclose(file);
295
296 //
297 // Look for a Unicode Signature (BOM) in the data
298 //
299 int32_t signatureLength;
300 const char * charDataStart = charBuf;
301 UErrorCode status = U_ZERO_ERROR;
302 const char* encoding = ucnv_detectUnicodeSignature(
303 charDataStart, rawFileLen, &signatureLength, &status);
304 if (U_FAILURE(status)) {
305 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
306 u_errorName(status));
307 return;
308 }
309 if(encoding!=NULL ){
310 charDataStart += signatureLength;
311 rawFileLen -= signatureLength;
312 }
313
314 //
315 // Open a converter to take the file to UTF-16
316 //
317 UConverter* conv;
318 conv = ucnv_open(encoding, &status);
319 if (U_FAILURE(status)) {
320 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
321 return;
322 }
323
324 //
325 // Convert the file data to UChar.
326 // Preflight first to determine required buffer size.
327 //
328 uint32_t destCap = ucnv_toUChars(conv,
329 NULL, // dest,
330 0, // destCapacity,
331 charDataStart,
332 rawFileLen,
333 &status);
334 if (status != U_BUFFER_OVERFLOW_ERROR) {
335 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
336 return;
337 };
338
339 status = U_ZERO_ERROR;
340 ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
341 ucnv_toUChars(conv,
342 ucharBuf, // dest,
343 destCap+1,
344 charDataStart,
345 rawFileLen,
346 &status);
347 if (U_FAILURE(status)) {
348 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
349 return;
350 };
351 ucnv_close(conv);
352
353 //
354 // Successful conversion. Set the global size variables so that
355 // the rest of the processing will proceed for this file.
356 //
357 fileLen = destCap;
358 }
359
360
361
362
363
364 //------------------------------------------------------------------------------------------
365 //
366 // nextLine Advance the line index variables, starting at the
367 // specified position in the input file buffer, by
368 // scanning forwrd until the next end-of-line.
369 //
370 // Need to take into account all of the possible Unicode
371 // line ending sequences.
372 //
373 //------------------------------------------------------------------------------------------
374 void nextLine(int startPos) {
375 if (startPos == 0) {
376 lineNum = 0;
377 } else {
378 lineNum++;
379 }
380 lineStart = lineEnd = startPos;
381
382 for (;;) {
383 if (lineEnd >= fileLen) {
384 return;
385 }
386 UChar c = ucharBuf[lineEnd];
387 lineEnd++;
388 if (c == 0x0a || // Line Feed
389 c == 0x0c || // Form Feed
390 c == 0x0d || // Carriage Return
391 c == 0x85 || // Next Line
392 c == 0x2028 || // Line Separator
393 c == 0x2029) // Paragraph separator
394 {
395 break;
396 }
397 }
398
399 // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
400 if (lineEnd < fileLen &&
401 ucharBuf[lineEnd-1] == 0x0d &&
402 ucharBuf[lineEnd] == 0x0a)
403 {
404 lineEnd++;
405 }
406 }
407
408
409 //------------------------------------------------------------------------------------------
410 //
411 // printMatch Called when a matching line has been located.
412 // Print out the line from the file with the match, after
413 // converting it back to the default code page.
414 //
415 //------------------------------------------------------------------------------------------
416 void printMatch() {
417 char buf[2000];
418 UErrorCode status = U_ZERO_ERROR;
419
420 // If we haven't already created a converter for output, do it now.
421 if (outConverter == 0) {
422 outConverter = ucnv_open(NULL, &status);
423 if (U_FAILURE(status)) {
424 fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n",
425 u_errorName(status));
426 exit(-1);
427 }
428 };
429
430 // Convert the line to be printed back to the default 8 bit code page.
431 // If the line is too long for our buffer, just truncate it.
432 ucnv_fromUChars(outConverter,
433 buf, // destination buffer for conversion
434 sizeof(buf), // capacity of destination buffer
435 &ucharBuf[lineStart], // Input to conversion
436 lineEnd-lineStart, // number of UChars to convert
437 &status);
438 buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines.
439 // The converter null-terminates its output unless
440 // the buffer completely fills.
441
442 if (displayFileName) {
443 printf("%s:", fileName);
444 }
445 if (displayLineNum) {
446 printf("%d:", lineNum);
447 }
448 printf("%s", buf);
449 }
450