added regex test suite
[wxWidgets.git] / tests / regex / regex.pl
1 #!/usr/bin/env perl -w
2 #############################################################################
3 # Name: regex.pl
4 # Purpose: Generate test code for wxRegEx from 'reg.test'
5 # Author: Mike Wetherell
6 # RCS-ID: $Id$
7 # Copyright: (c) Mike Wetherell
8 # Licence: wxWidgets licence
9 #############################################################################
10
11 #
12 # Notes:
13 # See './regex.pl -h' for usage
14 #
15 # Output at the moment is C++ using the cppunit testing framework. The
16 # language/framework specifics are separated, with the following 5
17 # subs as an interface: 'begin_output', 'begin_section', 'write_test',
18 # 'end_section' and 'end_output'. So for a different language/framework,
19 # implement 5 new similar subs.
20 #
21 # I've avoided using 'use encoding "UTF-8"', since this wasn't available
22 # in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
23 # earler than perl 5.6.0 aren't going to work.
24 #
25
26 use strict;
27 use File::Basename;
28 #use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
29
30 # if 0 output is wide characters, if 1 output is utf8 encoded
31 my $utf = 1;
32
33 # quote a parameter (C++ helper)
34 #
35 sub quotecxx {
36 my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
37 "\n" => "n", "\r" => "r", "\t" => "t",
38 "\013" => "v", '"' => '"', "\\" => "\\" );
39
40 # working around lack of 'use encoding'
41 $_ = pack "U0C*", unpack "C*", $_;
42 use utf8;
43
44 s/[\000-\037"\\\177-\x{ffff}]/
45 if ($esc{$&}) {
46 "\\$esc{$&}";
47 } elsif (ord($&) > 0x9f) {
48 if ($utf) {
49 $&;
50 } else {
51 sprintf "\\u%04x", ord($&);
52 }
53 } else {
54 sprintf "\\%03o", ord($&);
55 }
56 /ge;
57
58 # working around lack of 'use encoding'
59 no utf8;
60 $_ = pack "C*", unpack "C*", $_;
61
62 return ($utf ? '"' : 'L"') . $_ . '"'
63 }
64
65 # start writing the output code (C++ interface)
66 #
67 sub begin_output {
68 my ($from, $instructions) = @_;
69
70 # embed it in the comment
71 $from = "\n$from";
72 $from =~ s/^(?: )?/ * /mg;
73
74 # $instructions contains information about the flags etc.
75 if ($instructions) {
76 $instructions = "\n$instructions";
77 $instructions =~ s/^(?: )?/ * /mg;
78 }
79
80 my $u = $utf ? " (UTF-8 encoded)" : "";
81
82 print <<EOT;
83 /*
84 * Test data for wxRegEx$u
85 $from$instructions */
86
87 EOT
88 }
89
90 my @classes;
91
92 # start a new section (C++ interface)
93 #
94 sub begin_section {
95 my ($id, $title) = @_;
96 my $class = "regextest_$id";
97 $class =~ s/\W/_/g;
98 push @classes, [$id, $class];
99
100 print <<EOT;
101
102 /*
103 * $id $title
104 */
105
106 class $class : public RegExTestSuite
107 {
108 public:
109 $class() : RegExTestSuite("regex.$id") { }
110 static Test *suite();
111 };
112
113 Test *$class\::suite()
114 {
115 RegExTestSuite *suite = new $class;
116
117 EOT
118 }
119
120 # output a test line (C++ interface)
121 #
122 sub write_test {
123 my @args = @_;
124 $_ = quotecxx for @args;
125 print " suite->add(" . (join ', ', @args) . ", NULL);\n";
126 }
127
128 # end a section (C++ interface)
129 #
130 sub end_section {
131 my ($id, $class) = @{$classes[$#classes]};
132
133 print <<EOT;
134
135 return suite;
136 }
137
138 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
139
140 EOT
141 }
142
143 # finish off the output (C++ interface)
144 #
145 sub end_output {
146 print <<EOT;
147
148 /*
149 * A suite containing all the above suites
150 */
151
152 class regextest : public TestSuite
153 {
154 public:
155 regextest() : TestSuite("regex") { }
156 static Test *suite();
157 };
158
159 Test *regextest::suite()
160 {
161 TestSuite *suite = new regextest;
162
163 EOT
164 print " suite->addTest(".$_->[1]."::suite());\n" for @classes;
165
166 print <<EOT;
167
168 return suite;
169 }
170
171 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
172 CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
173 EOT
174 }
175
176 # Parse a tcl string. Handles curly quoting and double quoting.
177 #
178 sub parsetcl {
179 my ($curly, $quote);
180 # recursively defined expression that can parse balanced braces
181 # warning: uses experimental features of perl, see perlop(1)
182 $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
183 $quote = qr/"(?:\\"|[^"])*"/;
184 my @tokens = shift =~ /($curly|$quote|\S+)/g;
185
186 # now remove braces/quotes and unescape any escapes
187 for (@tokens) {
188 if (s/^{(.*)}$/$1/) {
189 # for curly quoting, only unescape \{ and \}
190 s/\\([{}])/$1/g;
191 } else {
192 s/^"(.*)"$/$1/;
193
194 # unescape any escapes
195 my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
196 "n" => "\n", "r" => "\r", "t" => "\t",
197 "v" => "\013" );
198 my $x = qr/[[:xdigit:]]/;
199
200 s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
201 if ($1 =~ m{^([0-7]+)}) {
202 chr(oct($1));
203 } elsif ($1 =~ m{^x($x+)}) {
204 pack("C0U", hex($1) & 0xff);
205 } elsif ($1 =~ m{^u($x+)}) {
206 pack("C0U", hex($1));
207 } elsif ($esc{$1}) {
208 $esc{$1};
209 } else {
210 $1;
211 }
212 /ge;
213 }
214 }
215
216 return @tokens;
217 }
218
219 # helpers which keep track of whether begin_section has been called, so that
220 # end_section can be called when appropriate
221 #
222 my @doing = ("0", "");
223 my $in_section = 0;
224
225 sub handle_doing {
226 end_section if $in_section;
227 $in_section = 0;
228 @doing = @_;
229 }
230
231 sub handle_test {
232 begin_section(@doing) if !$in_section;
233 $in_section = 1;
234 write_test @_;
235 }
236
237 sub handle_end {
238 end_section if $in_section;
239 $in_section = 0;
240 end_output;
241 }
242
243 # 'main' - start by parsing the command lines options.
244 #
245 my $badoption = !@ARGV;
246 my $utfdefault = $utf;
247 my $outputname;
248
249 for (my $i = 0; $i < @ARGV; ) {
250 if ($ARGV[$i] !~ m{^-.}) {
251 $i++;
252 next;
253 }
254
255 if ($ARGV[$i] eq '--') {
256 splice @ARGV, $i, 1;
257 last;
258 }
259
260 if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) { # -o : output file
261 $outputname = $2 || splice @ARGV, $i + 1, 1;
262 }
263
264 for (split //, substr($ARGV[$i], 1)) {
265 if (/u/i) { # -u : utf-8 output
266 $utf = 1;
267 } elsif (/w/i) { # -w : wide char output
268 $utf = 0;
269 } else {
270 $badoption = 1;
271 }
272 }
273
274 splice @ARGV, $i, 1;
275 }
276
277 # Display help
278 #
279 if ($badoption) {
280 my $prog = basename $0;
281 my ($w, $u) = (" (default)", " ");
282 ($w, $u) = ($u, $w) if $utfdefault;
283
284 print <<EOT;
285 Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
286 Generate test code for wxRegEx from 'reg.test'
287 Example: $prog -o regex.inc reg.test wxreg.test
288
289 -w$w Output will be wide characters.
290 -u$u Output will be UTF-8 encoded.
291
292 Input files should be in UTF-8. If no input files are specified input is
293 read from stdin. If no output file is specified output is written to stdout.
294 See the comments in reg.test (in src/regex) for details of the input file
295 format.
296 EOT
297 exit 0;
298 }
299
300 # Open the output file
301 #
302 open STDOUT, ">$outputname" if $outputname;
303
304 # Read in the files and initially parse just the comments for copyright
305 # information and instructions on the tests
306 #
307 my @input; # slurped input files stripped of comments
308 my $files = ""; # copyright info from the input comments
309 my $instructions = ""; # test instructions from the input comments
310
311 do {
312 my $inputname = basename $ARGV[0] if @ARGV;
313
314 # slurp input
315 undef $/;
316 my $in = <>;
317
318 # remove escaped newlines
319 $in =~ s/(?<!\\)\\\n//g;
320
321 # record the copyrights of the input files
322 for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
323 s/[\s:]+/ /g;
324 $files .= " ";
325 $files .= $inputname . ": " if $inputname && $inputname ne '-';
326 $files .= "$_\n";
327 }
328
329 # Parse the comments for instructions on the tests, which look like this:
330 # i successful match with -indices (used in checking things like
331 # nonparticipating subexpressions)
332 if (!$instructions) {
333 my $sp = qr{\t| +}; # tab or three or more spaces
334 my @instructions = $in =~
335 /\n(
336 (?:
337 \#$sp\S?$sp\S[^\n]+\n # instruction line
338 (?:\#$sp$sp\S[^\n]+\n)* # continuation lines (if any)
339 )+
340 )/gx;
341
342 if (@instructions) {
343 $instructions[0] = "Test types:\n$instructions[0]";
344 if (@instructions > 1) {
345 $instructions[1] = "Flag characters:\n$instructions[1]";
346 }
347 $instructions = join "\n", @instructions;
348 $instructions =~ s/^#([^\t]?)/ $1/mg;
349 }
350 }
351
352 # @input is the input of all files (stipped of comments)
353 $in =~ s/^#.*$//mg;
354 push @input, $in;
355
356 } while $ARGV[0];
357
358 # Make a string naming the generator, the input files and copyright info
359 #
360 my $from = "Generated " . localtime() . " by " . basename $0;
361 $from =~ s/[\s]+/ /g;
362 if ($files) {
363 if ($files =~ /:/) {
364 $from .= " from the following files:";
365 } else {
366 $from .= " from work with the following copyright:";
367 }
368 }
369 $from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g); # word-wrap
370 $from .= "\n$files" if $files;
371
372 # Now start to print the code
373 #
374 begin_output $from, $instructions;
375
376 # numbers for 'extra' sections
377 my $extra = 1;
378
379 for (@input)
380 {
381 # Print the main tests
382 #
383 # Test lines look like this:
384 # m 3 b {\(a\)b} ab ab a
385 #
386 # Also looks for heading lines, e.g.:
387 # doing 4 "parentheses"
388 #
389 for (split "\n") {
390 if (/^doing\s+(\S+)\s+(\S.*)/) {
391 handle_doing parsetcl "$1 $2";
392 } elsif (/^[efimp]\s/) {
393 handle_test parsetcl $_;
394 }
395 }
396
397 # Extra tests
398 #
399 # The expression below matches something like this:
400 # test reg-33.8 {Bug 505048} {
401 # regexp -inline {\A\s*[^b]*b} ab
402 # } ab
403 #
404 # The three subexpressions then return these parts:
405 # $extras[$i] = '{Bug 505048}',
406 # $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
407 # $extras[$i + 2] = 'ab'
408 #
409 my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n # line 1
410 \s*regexp\s+([^\n]+)\n # line 2
411 \}\s*(\S[^\n]*)/gx; # line 3
412
413 handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
414
415 for (my $i = 0; $i < @extras; $i += 3) {
416 my $id = $extras[$i];
417
418 # further parse the middle line into options and the rest (i.e. $args)
419 my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
420
421 my @args = parsetcl $args;
422 $#args = 1; # only want the first two
423
424 # now handle the options
425 my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
426 my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
427
428 # get them all in the right order and print
429 unshift @args, $test, parsetcl($id), '-';
430 push @args, parsetcl(parsetcl($results)) if $results;
431 handle_test @args;
432 }
433 }
434
435 # finish
436 #
437 handle_end;