]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | /************************************************************************* |
2 | * | |
3 | * © 2016 and later: Unicode, Inc. and others. | |
4 | * License & terms of use: http://www.unicode.org/copyright.html#License | |
5 | * | |
6 | ************************************************************************** | |
7 | ************************************************************************** | |
b75a7d8f | 8 | * |
729e4ab9 | 9 | * Copyright (C) 2002-2010, International Business Machines |
b75a7d8f A |
10 | * Corporation and others. All Rights Reserved. |
11 | * | |
12 | *************************************************************************** | |
13 | */ | |
14 | ||
15 | // | |
16 | // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions. | |
17 | // | |
18 | // The use of the ICU Regex API all occurs within the main() | |
0f5d89e8 | 19 | // function. The rest of the code deals with opening files, |
b75a7d8f A |
20 | // encoding conversions, printing results, etc. |
21 | // | |
22 | // This is not a full-featured grep program. The command line options | |
23 | // have been kept to a minimum to avoid complicating the sample code. | |
24 | // | |
25 | ||
26 | ||
27 | ||
28 | #include <stdio.h> | |
29 | #include <stdlib.h> | |
30 | #include <string.h> | |
31 | ||
32 | #include "unicode/utypes.h" | |
33 | #include "unicode/ustring.h" | |
34 | #include "unicode/regex.h" | |
35 | #include "unicode/ucnv.h" | |
36 | #include "unicode/uclean.h" | |
37 | ||
0f5d89e8 | 38 | using namespace icu; |
b75a7d8f A |
39 | |
40 | // | |
0f5d89e8 | 41 | // The following variables contain parameters that may be set from the command line. |
b75a7d8f A |
42 | // |
43 | const char *pattern = NULL; // The regular expression | |
44 | int firstFileNum; // argv index of the first file name | |
45 | UBool displayFileName = FALSE; | |
46 | UBool displayLineNum = FALSE; | |
47 | ||
48 | ||
49 | // | |
50 | // Info regarding the file currently being processed | |
51 | // | |
52 | const char *fileName; | |
53 | int fileLen; // Length, in UTF-16 Code Units. | |
54 | ||
55 | UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads | |
56 | // the whole file at once. | |
57 | ||
58 | char *charBuf = 0; // Buffer, for original, unconverted file data. | |
59 | ||
60 | ||
61 | // | |
62 | // Info regarding the line currently being processed | |
63 | // | |
64 | int lineStart; // Index of first char of the current line in the file buffer | |
65 | int lineEnd; // Index of char following the new line sequence for the current line | |
66 | int lineNum; | |
67 | ||
68 | // | |
69 | // Converter, used on output to convert Unicode data back to char * | |
70 | // so that it will display in non-Unicode terminal windows. | |
71 | // | |
72 | UConverter *outConverter = 0; | |
73 | ||
74 | // | |
75 | // Function forward declarations | |
76 | // | |
77 | void processOptions(int argc, const char **argv); | |
78 | void nextLine(int start); | |
79 | void printMatch(); | |
80 | void printUsage(); | |
81 | void readFile(const char *name); | |
82 | ||
83 | ||
84 | ||
85 | //------------------------------------------------------------------------------------------ | |
86 | // | |
87 | // main for ugrep | |
88 | // | |
89 | // Structurally, all use of the ICU Regular Expression API is in main(), | |
90 | // and all of the supporting stuff necessary to make a running program, but | |
91 | // not directly related to regular expressions, is factored out into these other | |
92 | // functions. | |
93 | // | |
94 | //------------------------------------------------------------------------------------------ | |
95 | int main(int argc, const char** argv) { | |
96 | UBool matchFound = FALSE; | |
97 | ||
98 | // | |
0f5d89e8 | 99 | // Process the command line options. |
b75a7d8f A |
100 | // |
101 | processOptions(argc, argv); | |
102 | ||
103 | // | |
104 | // Create a RegexPattern object from the user supplied pattern string. | |
105 | // | |
106 | UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure | |
107 | // in a status variable. | |
108 | ||
109 | UParseError parseErr; // In the event of a syntax error in the regex pattern, | |
110 | // this struct will contain the position of the | |
111 | // error. | |
112 | ||
113 | RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status); | |
114 | // Note that C++ is doing an automatic conversion | |
115 | // of the (char *) pattern to a temporary | |
116 | // UnicodeString object. | |
117 | if (U_FAILURE(status)) { | |
118 | fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n", | |
119 | u_errorName(status), parseErr.offset); | |
120 | exit(-1); | |
121 | } | |
122 | ||
123 | // | |
124 | // Create a RegexMatcher from the newly created pattern. | |
125 | // | |
126 | UnicodeString empty; | |
127 | RegexMatcher *matcher = rePat->matcher(empty, status); | |
128 | if (U_FAILURE(status)) { | |
129 | fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n", | |
130 | u_errorName(status)); | |
131 | exit(-1); | |
132 | } | |
133 | ||
134 | // | |
135 | // Loop, processing each of the input files. | |
136 | // | |
137 | for (int fileNum=firstFileNum; fileNum < argc; fileNum++) { | |
138 | readFile(argv[fileNum]); | |
139 | ||
140 | // | |
141 | // Loop through the lines of a file, trying to match the regex pattern on each. | |
142 | // | |
143 | for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) { | |
144 | UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart); | |
145 | matcher->reset(s); | |
146 | if (matcher->find()) { | |
147 | matchFound = TRUE; | |
148 | printMatch(); | |
149 | } | |
150 | } | |
151 | } | |
152 | ||
153 | // | |
154 | // Clean up | |
155 | // | |
156 | delete matcher; | |
157 | delete rePat; | |
158 | free(ucharBuf); | |
159 | free(charBuf); | |
160 | ucnv_close(outConverter); | |
161 | ||
162 | u_cleanup(); // shut down ICU, release any cached data it owns. | |
163 | ||
164 | return matchFound? 0: 1; | |
165 | } | |
166 | ||
167 | ||
168 | ||
169 | //------------------------------------------------------------------------------------------ | |
170 | // | |
171 | // doOptions Run through the command line options, and set | |
172 | // the global variables accordingly. | |
173 | // | |
0f5d89e8 | 174 | // exit without returning if an error occurred and |
b75a7d8f A |
175 | // ugrep should not proceed further. |
176 | // | |
177 | //------------------------------------------------------------------------------------------ | |
178 | void processOptions(int argc, const char **argv) { | |
179 | int optInd; | |
180 | UBool doUsage = FALSE; | |
181 | UBool doVersion = FALSE; | |
182 | const char *arg; | |
183 | ||
184 | ||
185 | for(optInd = 1; optInd < argc; ++optInd) { | |
186 | arg = argv[optInd]; | |
187 | ||
188 | /* version info */ | |
189 | if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) { | |
190 | doVersion = TRUE; | |
191 | } | |
192 | /* usage info */ | |
193 | else if(strcmp(arg, "--help") == 0) { | |
194 | doUsage = TRUE; | |
195 | } | |
196 | else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) { | |
197 | displayLineNum = TRUE; | |
198 | } | |
199 | /* POSIX.1 says all arguments after -- are not options */ | |
200 | else if(strcmp(arg, "--") == 0) { | |
201 | /* skip the -- */ | |
202 | ++optInd; | |
203 | break; | |
204 | } | |
205 | /* unrecognized option */ | |
206 | else if(strncmp(arg, "-", strlen("-")) == 0) { | |
207 | printf("ugrep: invalid option -- %s\n", arg+1); | |
208 | doUsage = TRUE; | |
209 | } | |
210 | /* done with options */ | |
211 | else { | |
212 | break; | |
213 | } | |
214 | } | |
215 | ||
216 | if (doUsage) { | |
217 | printUsage(); | |
218 | exit(0); | |
219 | } | |
220 | ||
221 | if (doVersion) { | |
222 | printf("ugrep version 0.01\n"); | |
223 | if (optInd == argc) { | |
224 | exit(0); | |
225 | } | |
226 | } | |
227 | ||
228 | int remainingArgs = argc-optInd; // pattern file ... | |
229 | if (remainingArgs < 2) { | |
230 | fprintf(stderr, "ugrep: files or pattern are missing.\n"); | |
231 | printUsage(); | |
232 | exit(1); | |
233 | } | |
234 | ||
235 | if (remainingArgs > 2) { | |
236 | // More than one file to be processed. Display file names with match output. | |
237 | displayFileName = TRUE; | |
238 | } | |
239 | ||
240 | pattern = argv[optInd]; | |
241 | firstFileNum = optInd+1; | |
242 | } | |
243 | ||
244 | //------------------------------------------------------------------------------------------ | |
245 | // | |
246 | // printUsage | |
247 | // | |
248 | //------------------------------------------------------------------------------------------ | |
249 | void printUsage() { | |
250 | printf("ugrep [options] pattern file...\n" | |
251 | " -V or --version display version information\n" | |
252 | " --help display this help and exit\n" | |
253 | " -- stop further option processing\n" | |
254 | "-n, --line-number Prefix each line of output with the line number within its input file.\n" | |
255 | ); | |
256 | exit(0); | |
257 | } | |
258 | ||
259 | //------------------------------------------------------------------------------------------ | |
260 | // | |
261 | // readFile Read a file into memory, and convert it to Unicode. | |
262 | // | |
263 | // Since this is just a demo program, take the simple minded approach | |
264 | // of always reading the whole file at once. No intelligent buffering | |
265 | // is done. | |
266 | // | |
267 | //------------------------------------------------------------------------------------------ | |
268 | void readFile(const char *name) { | |
269 | ||
270 | // | |
271 | // Initialize global file variables | |
272 | // | |
273 | fileName = name; | |
274 | fileLen = 0; // zero length prevents processing in case of errors. | |
275 | ||
276 | ||
277 | // | |
278 | // Open the file and determine its size. | |
279 | // | |
280 | FILE *file = fopen(name, "rb"); | |
281 | if (file == 0 ) { | |
282 | fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName); | |
283 | return; | |
284 | } | |
285 | fseek(file, 0, SEEK_END); | |
286 | int rawFileLen = ftell(file); | |
287 | fseek(file, 0, SEEK_SET); | |
288 | ||
289 | ||
290 | // | |
291 | // Read in the file | |
292 | // | |
293 | charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking... | |
0f5d89e8 | 294 | int t = static_cast<int>(fread(charBuf, 1, rawFileLen, file)); |
b75a7d8f A |
295 | if (t != rawFileLen) { |
296 | fprintf(stderr, "Error reading file \"%s\"\n", fileName); | |
729e4ab9 | 297 | fclose(file); |
b75a7d8f A |
298 | return; |
299 | } | |
300 | charBuf[rawFileLen]=0; | |
301 | fclose(file); | |
302 | ||
303 | // | |
304 | // Look for a Unicode Signature (BOM) in the data | |
305 | // | |
306 | int32_t signatureLength; | |
307 | const char * charDataStart = charBuf; | |
308 | UErrorCode status = U_ZERO_ERROR; | |
309 | const char* encoding = ucnv_detectUnicodeSignature( | |
310 | charDataStart, rawFileLen, &signatureLength, &status); | |
311 | if (U_FAILURE(status)) { | |
312 | fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n", | |
313 | u_errorName(status)); | |
314 | return; | |
315 | } | |
316 | if(encoding!=NULL ){ | |
317 | charDataStart += signatureLength; | |
318 | rawFileLen -= signatureLength; | |
319 | } | |
320 | ||
321 | // | |
322 | // Open a converter to take the file to UTF-16 | |
323 | // | |
324 | UConverter* conv; | |
325 | conv = ucnv_open(encoding, &status); | |
326 | if (U_FAILURE(status)) { | |
327 | fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status)); | |
328 | return; | |
329 | } | |
330 | ||
331 | // | |
332 | // Convert the file data to UChar. | |
333 | // Preflight first to determine required buffer size. | |
334 | // | |
335 | uint32_t destCap = ucnv_toUChars(conv, | |
336 | NULL, // dest, | |
337 | 0, // destCapacity, | |
338 | charDataStart, | |
339 | rawFileLen, | |
340 | &status); | |
341 | if (status != U_BUFFER_OVERFLOW_ERROR) { | |
342 | fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); | |
343 | return; | |
344 | }; | |
345 | ||
346 | status = U_ZERO_ERROR; | |
347 | ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar)); | |
348 | ucnv_toUChars(conv, | |
349 | ucharBuf, // dest, | |
350 | destCap+1, | |
351 | charDataStart, | |
352 | rawFileLen, | |
353 | &status); | |
354 | if (U_FAILURE(status)) { | |
355 | fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); | |
356 | return; | |
357 | }; | |
358 | ucnv_close(conv); | |
359 | ||
360 | // | |
361 | // Successful conversion. Set the global size variables so that | |
362 | // the rest of the processing will proceed for this file. | |
363 | // | |
364 | fileLen = destCap; | |
365 | } | |
366 | ||
367 | ||
368 | ||
369 | ||
370 | ||
371 | //------------------------------------------------------------------------------------------ | |
372 | // | |
373 | // nextLine Advance the line index variables, starting at the | |
374 | // specified position in the input file buffer, by | |
0f5d89e8 | 375 | // scanning forward until the next end-of-line. |
b75a7d8f A |
376 | // |
377 | // Need to take into account all of the possible Unicode | |
378 | // line ending sequences. | |
379 | // | |
380 | //------------------------------------------------------------------------------------------ | |
381 | void nextLine(int startPos) { | |
382 | if (startPos == 0) { | |
383 | lineNum = 0; | |
384 | } else { | |
385 | lineNum++; | |
386 | } | |
387 | lineStart = lineEnd = startPos; | |
388 | ||
389 | for (;;) { | |
390 | if (lineEnd >= fileLen) { | |
391 | return; | |
392 | } | |
393 | UChar c = ucharBuf[lineEnd]; | |
394 | lineEnd++; | |
395 | if (c == 0x0a || // Line Feed | |
396 | c == 0x0c || // Form Feed | |
397 | c == 0x0d || // Carriage Return | |
398 | c == 0x85 || // Next Line | |
399 | c == 0x2028 || // Line Separator | |
400 | c == 0x2029) // Paragraph separator | |
401 | { | |
402 | break; | |
403 | } | |
404 | } | |
405 | ||
406 | // Check for CR/LF sequence, and advance over the LF if we're in the middle of one. | |
407 | if (lineEnd < fileLen && | |
408 | ucharBuf[lineEnd-1] == 0x0d && | |
409 | ucharBuf[lineEnd] == 0x0a) | |
410 | { | |
411 | lineEnd++; | |
412 | } | |
413 | } | |
414 | ||
415 | ||
416 | //------------------------------------------------------------------------------------------ | |
417 | // | |
418 | // printMatch Called when a matching line has been located. | |
419 | // Print out the line from the file with the match, after | |
420 | // converting it back to the default code page. | |
421 | // | |
422 | //------------------------------------------------------------------------------------------ | |
423 | void printMatch() { | |
424 | char buf[2000]; | |
425 | UErrorCode status = U_ZERO_ERROR; | |
426 | ||
427 | // If we haven't already created a converter for output, do it now. | |
428 | if (outConverter == 0) { | |
429 | outConverter = ucnv_open(NULL, &status); | |
430 | if (U_FAILURE(status)) { | |
431 | fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n", | |
432 | u_errorName(status)); | |
433 | exit(-1); | |
434 | } | |
435 | }; | |
436 | ||
437 | // Convert the line to be printed back to the default 8 bit code page. | |
438 | // If the line is too long for our buffer, just truncate it. | |
439 | ucnv_fromUChars(outConverter, | |
440 | buf, // destination buffer for conversion | |
441 | sizeof(buf), // capacity of destination buffer | |
442 | &ucharBuf[lineStart], // Input to conversion | |
443 | lineEnd-lineStart, // number of UChars to convert | |
444 | &status); | |
445 | buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines. | |
446 | // The converter null-terminates its output unless | |
447 | // the buffer completely fills. | |
448 | ||
449 | if (displayFileName) { | |
450 | printf("%s:", fileName); | |
451 | } | |
452 | if (displayLineNum) { | |
453 | printf("%d:", lineNum); | |
454 | } | |
455 | printf("%s", buf); | |
456 | } | |
457 |