]> git.saurik.com Git - wxWidgets.git/blob - tests/regex/regex.pl
Remove all lines containing cvs/svn "$Id$" keyword.
[wxWidgets.git] / tests / regex / regex.pl
1 #!/usr/bin/env perl
2 #############################################################################
3 # Name: regex.pl
4 # Purpose: Generate test code for wxRegEx from 'reg.test'
5 # Author: Mike Wetherell
6 # Copyright: (c) Mike Wetherell
7 # Licence: wxWindows licence
8 #############################################################################
9
10 #
11 # Notes:
12 # See './regex.pl -h' for usage
13 #
14 # Output at the moment is C++ using the cppunit testing framework. The
15 # language/framework specifics are separated, with the following 5
16 # subs as an interface: 'begin_output', 'begin_section', 'write_test',
17 # 'end_section' and 'end_output'. So for a different language/framework,
18 # implement 5 new similar subs.
19 #
20 # I've avoided using 'use encoding "UTF-8"', since this wasn't available
21 # in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
22 # earlier than perl 5.6.0 aren't going to work.
23 #
24
25 use strict;
26 use warnings;
27 use File::Basename;
28 #use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
29
# if 0 output is wide characters, if 1 output is utf8 encoded
my $utf = 1;

# Quote the string held in $_ as a C++ string literal (C++ helper).
#
# Takes no arguments: it works on $_, which is modified in place. Control
# characters, double quotes, backslashes and everything from DEL upwards
# are replaced by escape sequences. Returns the escaped text wrapped in
# "..." when $utf is set, or in L"..." otherwise.
#
sub quotecxx {
    # characters that have a short symbolic escape
    my %esc = (
        "\a"   => "a",
        "\b"   => "b",
        "\f"   => "f",
        "\n"   => "n",
        "\r"   => "r",
        "\t"   => "t",
        "\013" => "v",
        '"'    => '"',
        "\\"   => "\\",
    );

    # working around lack of 'use encoding'
    unless ($utf) {
        $_ = pack "U0C*", unpack "C*", $_;
        use utf8;
    }

    # symbolic escape if there is one, \uXXXX for high characters in wide
    # mode, octal for everything else
    s/([\000-\037"\\\177-\x{ffff}])/
        my $ch = $1;
        $esc{$ch}                  ? "\\$esc{$ch}"
      : (ord($ch) > 0x9f && !$utf) ? sprintf("\\u%04x", ord($ch))
      :                              sprintf("\\%03o", ord($ch));
    /ge;

    # working around lack of 'use encoding'
    unless ($utf) {
        no utf8;
        $_ = pack "C*", unpack "C*", $_;
    }

    my $prefix = $utf ? '"' : 'L"';
    return $prefix . $_ . '"';
}
64
# Start writing the output code (C++ interface).
#
# Prints the opening block comment of the generated file.
#   $from         - text naming the generator, its inputs and copyrights
#   $instructions - description of the test types and flags (may be empty)
#
sub begin_output {
    my ($from, $instructions) = @_;

    # embed it in the comment
    ($from = "\n$from") =~ s/^(?: )?/ * /mg;

    # $instructions contains information about the flags etc.
    ($instructions = "\n$instructions") =~ s/^(?: )?/ * /mg
        if $instructions;

    my $encoding_note = $utf ? " (UTF-8 encoded)" : "";

    print <<EOT;
/*
* Test data for wxRegEx$encoding_note
$from$instructions */

EOT
}
89
# [id, class name] pairs of every section started so far
my @classes;

# Start a new section (C++ interface).
#
# Derives a C++ class name from $id (non-word characters become '_'),
# records it in @classes and prints the class declaration together with
# the opening of its suite() function.
#
sub begin_section {
    my ($id, $title) = @_;

    (my $suite_class = "regextest_$id") =~ s/\W/_/g;
    push @classes, [$id, $suite_class];

    print <<EOT;

/*
* $id $title
*/

class $suite_class : public RegExTestSuite
{
public:
$suite_class() : RegExTestSuite("regex.$id") { }
static Test *suite();
};

Test *${suite_class}::suite()
{
RegExTestSuite *suite = new $suite_class;

EOT
}
119
# Output a test line (C++ interface).
#
# Every argument is quoted with quotecxx and the results are printed as
# the arguments of one suite->add(...) call, terminated by NULL.
#
sub write_test {
    # work on a copy: quotecxx rewrites $_ in place
    my @quoted = @_;
    foreach (@quoted) {
        $_ = quotecxx();
    }

    my $arglist = join ', ', @quoted;
    print " suite->add($arglist, NULL);\n";
}
127
# End a section (C++ interface).
#
# Closes the suite() function of the most recently started section and
# prints its named registration.
#
sub end_section {
    my ($suite_id, $suite_class) = @{ $classes[-1] };

    print <<EOT;

return suite;
}

CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($suite_class, "regex.$suite_id");

EOT
}
142
# Finish off the output (C++ interface).
#
# Prints a 'regextest' suite that adds the suite of every section recorded
# in @classes, followed by its registrations.
#
sub end_output {
    print <<EOT;

/*
* A suite containing all the above suites
*/

class regextest : public TestSuite
{
public:
regextest() : TestSuite("regex") { }
static Test *suite();
};

Test *regextest::suite()
{
TestSuite *suite = new regextest;

EOT

    # one addTest line per section, in the order the sections were started
    for my $entry (@classes) {
        my $suite_class = $entry->[1];
        print " suite->addTest(${suite_class}::suite());\n";
    }

    print <<EOT;

return suite;
}

CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
EOT
}
175
# Parse a tcl string. Handles curly quoting and double quoting.
#
# Takes the string to parse and returns the list of its words with the
# outer braces or double quotes removed. In curly-quoted words only \{ and
# \} are unescaped; in every other word backslash escapes (octal, \x, \u,
# the usual single-letter escapes, or any escaped character) are expanded.
#
sub parsetcl {
    my $text = shift;

    # recursively defined expression that can parse balanced braces
    # warning: uses experimental features of perl, see perlop(1)
    my $curly;
    $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
    my $quote = qr/"(?:\\"|[^"])*"/;

    my @tokens = $text =~ /($curly|$quote|\S+)/g;

    # single-letter escapes, and a hex digit for the \x and \u forms
    my %esc = (
        "a" => "\a",
        "b" => "\b",
        "f" => "\f",
        "n" => "\n",
        "r" => "\r",
        "t" => "\t",
        "v" => "\013",
    );
    my $x = qr/[[:xdigit:]]/;

    # now remove braces/quotes and unescape any escapes
    for my $token (@tokens) {
        # for curly quoting, only unescape \{ and \}
        if ($token =~ s/^{(.*)}$/$1/) {
            $token =~ s/\\([{}])/$1/g;
            next;
        }

        # drop surrounding double quotes, if any
        $token =~ s/^"(.*)"$/$1/;

        # unescape any escapes
        $token =~ s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
            my $seq = $1;
            if    ($seq =~ m{^([0-7]+)}) { chr(oct($1)) }
            elsif ($seq =~ m{^x($x+)})   { pack("C0U", hex($1) & 0xff) }
            elsif ($seq =~ m{^u($x+)})   { pack("C0U", hex($1)) }
            elsif ($esc{$seq})           { $esc{$seq} }
            else                         { $seq }
        /ge;
    }

    return @tokens;
}
218
# helpers which keep track of whether begin_section has been called, so that
# end_section can be called when appropriate
#
my @doing = ("0", "");  # arguments for the next begin_section call
my $in_section = 0;     # true while a section is open

# Note a new heading: close the open section (if any) and remember the
# given arguments for the next begin_section call.
sub handle_doing {
    end_section() if $in_section;
    $in_section = 0;
    @doing = @_;
}
230
# Write one test, first opening a section with the remembered heading if
# none is open yet.
sub handle_test {
    unless ($in_section) {
        begin_section(@doing);
    }
    $in_section = 1;
    write_test(@_);
}
236
# Close the section that is still open (if any), then finish the output.
sub handle_end {
    if ($in_section) {
        end_section();
    }
    $in_section = 0;
    end_output();
}
242
# 'main' - start by parsing the command lines options.
#
# Options are removed from @ARGV as they are handled, so that only the
# input file names remain. Recognised (case-insensitively, and possibly
# clustered): -u, -w and -o. Anything else sets $badoption, which makes
# the usage text appear.
#
my $badoption = !@ARGV;     # no arguments at all also shows the usage
my $utfdefault = $utf;      # remembered so the usage can mark the default
my $outputname;

for (my $i = 0; $i < @ARGV; ) {
    # not an option (a lone '-' is not one either): leave it in @ARGV
    if ($ARGV[$i] !~ m{^-.}) {
        $i++;
        next;
    }

    # '--' ends the options
    if ($ARGV[$i] eq '--') {
        splice @ARGV, $i, 1;
        last;
    }

    # -o : output file
    # The name is either attached ('-oFILE') or is the next argument. The
    # match before the 'o' must be non-greedy: a greedy match would split
    # an attached name at its last 'o' ('-ofoo.inc' would give '.inc').
    if ($ARGV[$i] =~ s{^-(.*?)o(.*)$}{-$1}i) {
        my $attached = $2;

        # test the length rather than the truth, so that '-o0' works
        $outputname = length $attached
            ? $attached
            : splice @ARGV, $i + 1, 1;

        # '-o' as the last argument has no file name to go with it
        $badoption = 1 unless defined $outputname;
    }

    # whatever is left of the argument is a cluster of one-letter flags
    for (split //, substr($ARGV[$i], 1)) {
        if (/u/i) {             # -u : utf-8 output
            $utf = 1;
        } elsif (/w/i) {        # -w : wide char output
            $utf = 0;
        } else {
            $badoption = 1;
        }
    }

    splice @ARGV, $i, 1;
}
276
# Display help
#
# Shown when an unknown option was given or when there were no arguments
# at all; the program then exits with status 0.
#
if ($badoption) {
    my $prog = basename($0);

    # mark whichever of -w / -u is the built-in default
    my $w = $utfdefault ? " " : " (default)";
    my $u = $utfdefault ? " (default)" : " ";

    print <<EOT;
Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
Generate test code for wxRegEx from 'reg.test'
Example: $prog -o regex.inc reg.test wxreg.test

-w$w Output will be wide characters.
-u$u Output will be UTF-8 encoded.

Input files should be in UTF-8. If no input files are specified input is
read from stdin. If no output file is specified output is written to stdout.
See the comments in reg.test for details of the input file format.
EOT
    exit 0;
}
298
# Open the output file
#
# STDOUT is redirected so that every later 'print' goes to the file. The
# three-argument form keeps characters in the file name from being read as
# an open mode, and a failure stops the program instead of silently
# writing to the terminal. The name is tested for length rather than
# truth so that a file called '0' is accepted.
#
if (defined $outputname && length $outputname) {
    open STDOUT, '>', $outputname
        or die "$0: cannot open '$outputname' for writing: $!\n";
}
302
# Read in the files and initially parse just the comments for copyright
# information and instructions on the tests
#
my @input;              # slurped input files stripped of comments
my $files = "";         # copyright info from the input comments
my $instructions = "";  # test instructions from the input comments

do {
    # Name of the file about to be read, undef when there is none (stdin).
    # A plain conditional is used because 'my $x = ... if COND' has
    # undefined behaviour.
    my $inputname = @ARGV ? basename($ARGV[0]) : undef;

    # slurp input; $/ is localised so slurp mode ends with this block
    local $/;
    my $in = <>;
    $in = "" unless defined $in;    # nothing could be read

    # remove escaped newlines
    $in =~ s/(?<!\\)\\\n//g;

    # record the copyrights of the input files
    for my $notice ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
        $notice =~ s/[\s:]+/ /g;
        $files .= " ";
        $files .= $inputname . ": " if $inputname && $inputname ne '-';
        $files .= "$notice\n";
    }

    # Parse the comments for instructions on the tests, which look like this:
    # i successful match with -indices (used in checking things like
    # nonparticipating subexpressions)
    # Only the first input that has any is used.
    if (!$instructions) {
        my $sp = qr{\t| {3,}};  # tab or three or more spaces
        my @groups = $in =~
            /\n(
                (?:
                    \#$sp\S?$sp\S[^\n]+\n       # instruction line
                    (?:\#$sp$sp\S[^\n]+\n)*     # continuation lines (if any)
                )+
            )/gx;

        if (@groups) {
            # the first group is headed as the test types, the second as
            # the flag characters
            $groups[0] = "Test types:\n$groups[0]";
            if (@groups > 1) {
                $groups[1] = "Flag characters:\n$groups[1]";
            }
            $instructions = join "\n", @groups;
            $instructions =~ s/^#([^\t]?)/ $1/mg;
        }
    }

    # @input is the input of all files (stripped of comments)
    $in =~ s/^#.*$//mg;
    push @input, $in;

} while @ARGV;  # test the count, so a file named '0' does not end the loop
356
# Make a string naming the generator, the input files and copyright info
#
my $timestamp = localtime();    # scalar context gives the date as a string
my $from = join " ", "Generated", $timestamp, "by", basename($0);
$from =~ s/\s+/ /g;

if ($files) {
    # a ':' means that at least one copyright line carries a file name
    $from .= $files =~ /:/
        ? " from the following files:"
        : " from work with the following copyright:";
}

# word-wrap
my @wrapped = $from =~ /(.{0,76}(?:\s|$))/g;
$from = join "\n", @wrapped;
$from .= "\n$files" if $files;

# Now start to print the code
#
begin_output($from, $instructions);
374
# numbers for 'extra' sections
my $extra = 1;

for my $text (@input) {
    # Print the main tests
    #
    # Test lines look like this:
    # m 3 b {\(a\)b} ab ab a
    #
    # Also looks for heading lines, e.g.:
    # doing 4 "parentheses"
    #
    for my $line (split /\n/, $text) {
        if ($line =~ /^doing\s+(\S+)\s+(\S.*)/) {
            handle_doing(parsetcl("$1 $2"));
        }
        elsif ($line =~ /^[efimp]\s/) {
            handle_test(parsetcl($line));
        }
    }

    # Extra tests
    #
    # The expression below matches something like this:
    # test reg-33.8 {Bug 505048} {
    # regexp -inline {\A\s*[^b]*b} ab
    # } ab
    #
    # The three subexpressions then return these parts:
    # '{Bug 505048}',
    # '-inline {\A\s*[^b]*b} ab'
    # 'ab'
    #
    my @extras = $text =~ /\ntest\s+\S+\s*(\{.*?\})\s*\{\n  # line 1
                           \s*regexp\s+([^\n]+)\n          # line 2
                           \}\s*(\S[^\n]*)/gx;             # line 3

    handle_doing("extra_" . $extra++, "checks for bug fixes") if @extras;

    # take the captures three at a time: id, regexp command, expected result
    while (my ($id, $command, $expected) = splice @extras, 0, 3) {
        # further parse the middle line into options and the rest
        my ($opts, $rest) = $command =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;

        # only want the first two
        my @operands = parsetcl($rest);
        $#operands = 1;

        # now handle the options
        my $test;
        if ($opts =~ /-indices/) {
            $test = 'i';
        }
        elsif ($expected) {
            $test = 'm';
        }
        else {
            $test = 'f';
        }

        my $results = '';
        if ($opts =~ /-inline/ && $test ne 'f') {
            $results = $expected;
        }

        # get them all in the right order and print
        my @test_args = ($test, parsetcl($id), ($results ? '-' : 'o'), @operands);
        push @test_args, parsetcl(parsetcl($results)) if $results;
        handle_test(@test_args);
    }
}

# finish
#
handle_end();
434 # finish
435 #
436 handle_end;