Border correction
[wxWidgets.git] / tests / regex / regex.pl
CommitLineData
d7204bba 1#!/usr/bin/env perl
e70833fb
VS
2#############################################################################
3# Name: regex.pl
4# Purpose: Generate test code for wxRegEx from 'reg.test'
5# Author: Mike Wetherell
6# RCS-ID: $Id$
7# Copyright: (c) Mike Wetherell
8# Licence: wxWidgets licence
9#############################################################################
10
11#
12# Notes:
13# See './regex.pl -h' for usage
14#
15# Output at the moment is C++ using the cppunit testing framework. The
16# language/framework specifics are separated, with the following 5
17# subs as an interface: 'begin_output', 'begin_section', 'write_test',
18# 'end_section' and 'end_output'. So for a different language/framework,
19# implement 5 new similar subs.
20#
21# I've avoided using 'use encoding "UTF-8"', since this wasn't available
22# in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
23# earler than perl 5.6.0 aren't going to work.
24#
25
26use strict;
d7204bba 27use warnings;
e70833fb
VS
28use File::Basename;
29#use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
30
31# if 0 output is wide characters, if 1 output is utf8 encoded
32my $utf = 1;
33
34# quote a parameter (C++ helper)
35#
36sub quotecxx {
37 my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
38 "\n" => "n", "\r" => "r", "\t" => "t",
39 "\013" => "v", '"' => '"', "\\" => "\\" );
40
41 # working around lack of 'use encoding'
a7ea35b0
MW
42 if (!$utf) {
43 $_ = pack "U0C*", unpack "C*", $_;
44 use utf8;
45 }
e70833fb
VS
46
47 s/[\000-\037"\\\177-\x{ffff}]/
48 if ($esc{$&}) {
49 "\\$esc{$&}";
a7ea35b0
MW
50 } elsif (ord($&) > 0x9f && !$utf) {
51 sprintf "\\u%04x", ord($&);
e70833fb
VS
52 } else {
53 sprintf "\\%03o", ord($&);
54 }
55 /ge;
56
57 # working around lack of 'use encoding'
a7ea35b0
MW
58 if (!$utf) {
59 no utf8;
60 $_ = pack "C*", unpack "C*", $_;
61 }
e70833fb
VS
62
63 return ($utf ? '"' : 'L"') . $_ . '"'
64}
65
66# start writing the output code (C++ interface)
67#
68sub begin_output {
69 my ($from, $instructions) = @_;
70
71 # embed it in the comment
72 $from = "\n$from";
73 $from =~ s/^(?: )?/ * /mg;
74
75 # $instructions contains information about the flags etc.
76 if ($instructions) {
77 $instructions = "\n$instructions";
78 $instructions =~ s/^(?: )?/ * /mg;
79 }
80
81 my $u = $utf ? " (UTF-8 encoded)" : "";
82
83 print <<EOT;
84/*
85 * Test data for wxRegEx$u
86$from$instructions */
87
88EOT
89}
90
91my @classes;
92
93# start a new section (C++ interface)
94#
95sub begin_section {
96 my ($id, $title) = @_;
97 my $class = "regextest_$id";
98 $class =~ s/\W/_/g;
99 push @classes, [$id, $class];
100
101 print <<EOT;
102
103/*
104 * $id $title
105 */
106
107class $class : public RegExTestSuite
108{
109public:
110 $class() : RegExTestSuite("regex.$id") { }
111 static Test *suite();
112};
113
114Test *$class\::suite()
115{
116 RegExTestSuite *suite = new $class;
117
118EOT
119}
120
121# output a test line (C++ interface)
122#
123sub write_test {
124 my @args = @_;
125 $_ = quotecxx for @args;
126 print " suite->add(" . (join ', ', @args) . ", NULL);\n";
127}
128
129# end a section (C++ interface)
130#
131sub end_section {
132 my ($id, $class) = @{$classes[$#classes]};
133
134 print <<EOT;
135
136 return suite;
137}
138
139CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
140
141EOT
142}
143
144# finish off the output (C++ interface)
145#
146sub end_output {
147 print <<EOT;
148
149/*
150 * A suite containing all the above suites
151 */
152
153class regextest : public TestSuite
154{
155public:
156 regextest() : TestSuite("regex") { }
157 static Test *suite();
158};
159
160Test *regextest::suite()
161{
162 TestSuite *suite = new regextest;
163
164EOT
165 print " suite->addTest(".$_->[1]."::suite());\n" for @classes;
166
167 print <<EOT;
168
169 return suite;
170}
171
172CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
173CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
174EOT
175}
176
177# Parse a tcl string. Handles curly quoting and double quoting.
178#
179sub parsetcl {
180 my ($curly, $quote);
181 # recursively defined expression that can parse balanced braces
182 # warning: uses experimental features of perl, see perlop(1)
183 $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
184 $quote = qr/"(?:\\"|[^"])*"/;
185 my @tokens = shift =~ /($curly|$quote|\S+)/g;
186
187 # now remove braces/quotes and unescape any escapes
188 for (@tokens) {
189 if (s/^{(.*)}$/$1/) {
190 # for curly quoting, only unescape \{ and \}
191 s/\\([{}])/$1/g;
192 } else {
193 s/^"(.*)"$/$1/;
194
195 # unescape any escapes
196 my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
197 "n" => "\n", "r" => "\r", "t" => "\t",
198 "v" => "\013" );
199 my $x = qr/[[:xdigit:]]/;
200
201 s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
202 if ($1 =~ m{^([0-7]+)}) {
203 chr(oct($1));
204 } elsif ($1 =~ m{^x($x+)}) {
205 pack("C0U", hex($1) & 0xff);
206 } elsif ($1 =~ m{^u($x+)}) {
207 pack("C0U", hex($1));
208 } elsif ($esc{$1}) {
209 $esc{$1};
210 } else {
211 $1;
212 }
213 /ge;
214 }
215 }
216
217 return @tokens;
218}
219
220# helpers which keep track of whether begin_section has been called, so that
221# end_section can be called when appropriate
222#
223my @doing = ("0", "");
224my $in_section = 0;
225
226sub handle_doing {
227 end_section if $in_section;
228 $in_section = 0;
229 @doing = @_;
230}
231
232sub handle_test {
233 begin_section(@doing) if !$in_section;
234 $in_section = 1;
235 write_test @_;
236}
237
238sub handle_end {
239 end_section if $in_section;
240 $in_section = 0;
241 end_output;
242}
243
244# 'main' - start by parsing the command lines options.
245#
246my $badoption = !@ARGV;
247my $utfdefault = $utf;
248my $outputname;
249
250for (my $i = 0; $i < @ARGV; ) {
251 if ($ARGV[$i] !~ m{^-.}) {
252 $i++;
253 next;
254 }
255
256 if ($ARGV[$i] eq '--') {
257 splice @ARGV, $i, 1;
258 last;
259 }
260
261 if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) { # -o : output file
262 $outputname = $2 || splice @ARGV, $i + 1, 1;
263 }
264
265 for (split //, substr($ARGV[$i], 1)) {
266 if (/u/i) { # -u : utf-8 output
267 $utf = 1;
268 } elsif (/w/i) { # -w : wide char output
269 $utf = 0;
270 } else {
271 $badoption = 1;
272 }
273 }
274
275 splice @ARGV, $i, 1;
276}
277
278# Display help
279#
280if ($badoption) {
281 my $prog = basename $0;
282 my ($w, $u) = (" (default)", " ");
283 ($w, $u) = ($u, $w) if $utfdefault;
284
285 print <<EOT;
286Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
287Generate test code for wxRegEx from 'reg.test'
288Example: $prog -o regex.inc reg.test wxreg.test
289
290 -w$w Output will be wide characters.
291 -u$u Output will be UTF-8 encoded.
292
293Input files should be in UTF-8. If no input files are specified input is
294read from stdin. If no output file is specified output is written to stdout.
d7204bba 295See the comments in reg.test for details of the input file format.
e70833fb
VS
296EOT
297 exit 0;
298}
299
300# Open the output file
301#
302open STDOUT, ">$outputname" if $outputname;
303
304# Read in the files and initially parse just the comments for copyright
305# information and instructions on the tests
306#
307my @input; # slurped input files stripped of comments
308my $files = ""; # copyright info from the input comments
309my $instructions = ""; # test instructions from the input comments
310
311do {
312 my $inputname = basename $ARGV[0] if @ARGV;
313
314 # slurp input
315 undef $/;
316 my $in = <>;
317
318 # remove escaped newlines
319 $in =~ s/(?<!\\)\\\n//g;
320
321 # record the copyrights of the input files
322 for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
323 s/[\s:]+/ /g;
324 $files .= " ";
325 $files .= $inputname . ": " if $inputname && $inputname ne '-';
326 $files .= "$_\n";
327 }
328
329 # Parse the comments for instructions on the tests, which look like this:
330 # i successful match with -indices (used in checking things like
331 # nonparticipating subexpressions)
332 if (!$instructions) {
333 my $sp = qr{\t| +}; # tab or three or more spaces
334 my @instructions = $in =~
335 /\n(
336 (?:
337 \#$sp\S?$sp\S[^\n]+\n # instruction line
338 (?:\#$sp$sp\S[^\n]+\n)* # continuation lines (if any)
339 )+
340 )/gx;
341
342 if (@instructions) {
343 $instructions[0] = "Test types:\n$instructions[0]";
344 if (@instructions > 1) {
345 $instructions[1] = "Flag characters:\n$instructions[1]";
346 }
347 $instructions = join "\n", @instructions;
348 $instructions =~ s/^#([^\t]?)/ $1/mg;
349 }
350 }
351
352 # @input is the input of all files (stipped of comments)
353 $in =~ s/^#.*$//mg;
354 push @input, $in;
355
356} while $ARGV[0];
357
358# Make a string naming the generator, the input files and copyright info
359#
360my $from = "Generated " . localtime() . " by " . basename $0;
361$from =~ s/[\s]+/ /g;
362if ($files) {
363 if ($files =~ /:/) {
364 $from .= " from the following files:";
365 } else {
366 $from .= " from work with the following copyright:";
367 }
368}
369$from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g); # word-wrap
370$from .= "\n$files" if $files;
371
372# Now start to print the code
373#
374begin_output $from, $instructions;
375
376# numbers for 'extra' sections
377my $extra = 1;
378
379for (@input)
380{
381 # Print the main tests
382 #
383 # Test lines look like this:
384 # m 3 b {\(a\)b} ab ab a
385 #
386 # Also looks for heading lines, e.g.:
387 # doing 4 "parentheses"
388 #
389 for (split "\n") {
390 if (/^doing\s+(\S+)\s+(\S.*)/) {
391 handle_doing parsetcl "$1 $2";
392 } elsif (/^[efimp]\s/) {
393 handle_test parsetcl $_;
394 }
395 }
396
397 # Extra tests
398 #
399 # The expression below matches something like this:
400 # test reg-33.8 {Bug 505048} {
401 # regexp -inline {\A\s*[^b]*b} ab
402 # } ab
403 #
404 # The three subexpressions then return these parts:
405 # $extras[$i] = '{Bug 505048}',
406 # $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
407 # $extras[$i + 2] = 'ab'
408 #
409 my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n # line 1
410 \s*regexp\s+([^\n]+)\n # line 2
411 \}\s*(\S[^\n]*)/gx; # line 3
412
413 handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
414
415 for (my $i = 0; $i < @extras; $i += 3) {
416 my $id = $extras[$i];
417
418 # further parse the middle line into options and the rest (i.e. $args)
419 my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
420
421 my @args = parsetcl $args;
422 $#args = 1; # only want the first two
423
424 # now handle the options
425 my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
426 my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
427
428 # get them all in the right order and print
bc10103e 429 unshift @args, $test, parsetcl($id), $results ? '-' : 'o';
e70833fb
VS
430 push @args, parsetcl(parsetcl($results)) if $results;
431 handle_test @args;
432 }
433}
434
435# finish
436#
437handle_end;