Eliminate non-ascii chars from generated include file
[wxWidgets.git] / tests / regex / regex.pl
1 #!/usr/bin/env perl
2 #############################################################################
3 # Name: regex.pl
4 # Purpose: Generate test code for wxRegEx from 'reg.test'
5 # Author: Mike Wetherell
6 # RCS-ID: $Id$
7 # Copyright: (c) Mike Wetherell
8 # Licence: wxWidgets licence
9 #############################################################################
10
11 #
12 # Notes:
13 # See './regex.pl -h' for usage
14 #
15 # Output at the moment is C++ using the cppunit testing framework. The
16 # language/framework specifics are separated, with the following 5
17 # subs as an interface: 'begin_output', 'begin_section', 'write_test',
18 # 'end_section' and 'end_output'. So for a different language/framework,
19 # implement 5 new similar subs.
20 #
21 # I've avoided using 'use encoding "UTF-8"', since this wasn't available
22 # in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
23 # earler than perl 5.6.0 aren't going to work.
24 #
25
26 use strict;
27 use warnings;
28 use File::Basename;
29 #use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
30
31 # if 0 output is wide characters, if 1 output is utf8 encoded
32 my $utf = 1;
33
34 # quote a parameter (C++ helper)
35 #
36 sub quotecxx {
37 my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
38 "\n" => "n", "\r" => "r", "\t" => "t",
39 "\013" => "v", '"' => '"', "\\" => "\\" );
40
41 # working around lack of 'use encoding'
42 if (!$utf) {
43 $_ = pack "U0C*", unpack "C*", $_;
44 use utf8;
45 }
46
47 s/[\000-\037"\\\177-\x{ffff}]/
48 if ($esc{$&}) {
49 "\\$esc{$&}";
50 } elsif (ord($&) > 0x9f && !$utf) {
51 sprintf "\\u%04x", ord($&);
52 } else {
53 sprintf "\\%03o", ord($&);
54 }
55 /ge;
56
57 # working around lack of 'use encoding'
58 if (!$utf) {
59 no utf8;
60 $_ = pack "C*", unpack "C*", $_;
61 }
62
63 return ($utf ? '"' : 'L"') . $_ . '"'
64 }
65
66 # start writing the output code (C++ interface)
67 #
68 sub begin_output {
69 my ($from, $instructions) = @_;
70
71 # embed it in the comment
72 $from = "\n$from";
73 $from =~ s/^(?: )?/ * /mg;
74
75 # $instructions contains information about the flags etc.
76 if ($instructions) {
77 $instructions = "\n$instructions";
78 $instructions =~ s/^(?: )?/ * /mg;
79 }
80
81 my $u = $utf ? " (UTF-8 encoded)" : "";
82
83 print <<EOT;
84 /*
85 * Test data for wxRegEx$u
86 $from$instructions */
87
88 EOT
89 }
90
91 my @classes;
92
93 # start a new section (C++ interface)
94 #
95 sub begin_section {
96 my ($id, $title) = @_;
97 my $class = "regextest_$id";
98 $class =~ s/\W/_/g;
99 push @classes, [$id, $class];
100
101 print <<EOT;
102
103 /*
104 * $id $title
105 */
106
107 class $class : public RegExTestSuite
108 {
109 public:
110 $class() : RegExTestSuite("regex.$id") { }
111 static Test *suite();
112 };
113
114 Test *$class\::suite()
115 {
116 RegExTestSuite *suite = new $class;
117
118 EOT
119 }
120
121 # output a test line (C++ interface)
122 #
123 sub write_test {
124 my @args = @_;
125 $_ = quotecxx for @args;
126 print " suite->add(" . (join ', ', @args) . ", NULL);\n";
127 }
128
129 # end a section (C++ interface)
130 #
131 sub end_section {
132 my ($id, $class) = @{$classes[$#classes]};
133
134 print <<EOT;
135
136 return suite;
137 }
138
139 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
140
141 EOT
142 }
143
144 # finish off the output (C++ interface)
145 #
146 sub end_output {
147 print <<EOT;
148
149 /*
150 * A suite containing all the above suites
151 */
152
153 class regextest : public TestSuite
154 {
155 public:
156 regextest() : TestSuite("regex") { }
157 static Test *suite();
158 };
159
160 Test *regextest::suite()
161 {
162 TestSuite *suite = new regextest;
163
164 EOT
165 print " suite->addTest(".$_->[1]."::suite());\n" for @classes;
166
167 print <<EOT;
168
169 return suite;
170 }
171
172 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
173 CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
174 EOT
175 }
176
177 # Parse a tcl string. Handles curly quoting and double quoting.
178 #
179 sub parsetcl {
180 my ($curly, $quote);
181 # recursively defined expression that can parse balanced braces
182 # warning: uses experimental features of perl, see perlop(1)
183 $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
184 $quote = qr/"(?:\\"|[^"])*"/;
185 my @tokens = shift =~ /($curly|$quote|\S+)/g;
186
187 # now remove braces/quotes and unescape any escapes
188 for (@tokens) {
189 if (s/^{(.*)}$/$1/) {
190 # for curly quoting, only unescape \{ and \}
191 s/\\([{}])/$1/g;
192 } else {
193 s/^"(.*)"$/$1/;
194
195 # unescape any escapes
196 my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
197 "n" => "\n", "r" => "\r", "t" => "\t",
198 "v" => "\013" );
199 my $x = qr/[[:xdigit:]]/;
200
201 s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
202 if ($1 =~ m{^([0-7]+)}) {
203 chr(oct($1));
204 } elsif ($1 =~ m{^x($x+)}) {
205 pack("C0U", hex($1) & 0xff);
206 } elsif ($1 =~ m{^u($x+)}) {
207 pack("C0U", hex($1));
208 } elsif ($esc{$1}) {
209 $esc{$1};
210 } else {
211 $1;
212 }
213 /ge;
214 }
215 }
216
217 return @tokens;
218 }
219
220 # helpers which keep track of whether begin_section has been called, so that
221 # end_section can be called when appropriate
222 #
223 my @doing = ("0", "");
224 my $in_section = 0;
225
226 sub handle_doing {
227 end_section if $in_section;
228 $in_section = 0;
229 @doing = @_;
230 }
231
232 sub handle_test {
233 begin_section(@doing) if !$in_section;
234 $in_section = 1;
235 write_test @_;
236 }
237
238 sub handle_end {
239 end_section if $in_section;
240 $in_section = 0;
241 end_output;
242 }
243
244 # 'main' - start by parsing the command lines options.
245 #
246 my $badoption = !@ARGV;
247 my $utfdefault = $utf;
248 my $outputname;
249
250 for (my $i = 0; $i < @ARGV; ) {
251 if ($ARGV[$i] !~ m{^-.}) {
252 $i++;
253 next;
254 }
255
256 if ($ARGV[$i] eq '--') {
257 splice @ARGV, $i, 1;
258 last;
259 }
260
261 if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) { # -o : output file
262 $outputname = $2 || splice @ARGV, $i + 1, 1;
263 }
264
265 for (split //, substr($ARGV[$i], 1)) {
266 if (/u/i) { # -u : utf-8 output
267 $utf = 1;
268 } elsif (/w/i) { # -w : wide char output
269 $utf = 0;
270 } else {
271 $badoption = 1;
272 }
273 }
274
275 splice @ARGV, $i, 1;
276 }
277
278 # Display help
279 #
280 if ($badoption) {
281 my $prog = basename $0;
282 my ($w, $u) = (" (default)", " ");
283 ($w, $u) = ($u, $w) if $utfdefault;
284
285 print <<EOT;
286 Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
287 Generate test code for wxRegEx from 'reg.test'
288 Example: $prog -o regex.inc reg.test wxreg.test
289
290 -w$w Output will be wide characters.
291 -u$u Output will be UTF-8 encoded.
292
293 Input files should be in UTF-8. If no input files are specified input is
294 read from stdin. If no output file is specified output is written to stdout.
295 See the comments in reg.test for details of the input file format.
296 EOT
297 exit 0;
298 }
299
300 # Open the output file
301 #
302 open STDOUT, ">$outputname" if $outputname;
303
304 # Read in the files and initially parse just the comments for copyright
305 # information and instructions on the tests
306 #
307 my @input; # slurped input files stripped of comments
308 my $files = ""; # copyright info from the input comments
309 my $instructions = ""; # test instructions from the input comments
310
311 do {
312 my $inputname = basename $ARGV[0] if @ARGV;
313
314 # slurp input
315 undef $/;
316 my $in = <>;
317
318 # remove escaped newlines
319 $in =~ s/(?<!\\)\\\n//g;
320
321 # record the copyrights of the input files
322 for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
323 s/[\s:]+/ /g;
324 $files .= " ";
325 $files .= $inputname . ": " if $inputname && $inputname ne '-';
326 $files .= "$_\n";
327 }
328
329 # Parse the comments for instructions on the tests, which look like this:
330 # i successful match with -indices (used in checking things like
331 # nonparticipating subexpressions)
332 if (!$instructions) {
333 my $sp = qr{\t| +}; # tab or three or more spaces
334 my @instructions = $in =~
335 /\n(
336 (?:
337 \#$sp\S?$sp\S[^\n]+\n # instruction line
338 (?:\#$sp$sp\S[^\n]+\n)* # continuation lines (if any)
339 )+
340 )/gx;
341
342 if (@instructions) {
343 $instructions[0] = "Test types:\n$instructions[0]";
344 if (@instructions > 1) {
345 $instructions[1] = "Flag characters:\n$instructions[1]";
346 }
347 $instructions = join "\n", @instructions;
348 $instructions =~ s/^#([^\t]?)/ $1/mg;
349 }
350 }
351
352 # @input is the input of all files (stipped of comments)
353 $in =~ s/^#.*$//mg;
354 push @input, $in;
355
356 } while $ARGV[0];
357
358 # Make a string naming the generator, the input files and copyright info
359 #
360 my $from = "Generated " . localtime() . " by " . basename $0;
361 $from =~ s/[\s]+/ /g;
362 if ($files) {
363 if ($files =~ /:/) {
364 $from .= " from the following files:";
365 } else {
366 $from .= " from work with the following copyright:";
367 }
368 }
369 $from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g); # word-wrap
370 $from .= "\n$files" if $files;
371
372 # Now start to print the code
373 #
374 begin_output $from, $instructions;
375
376 # numbers for 'extra' sections
377 my $extra = 1;
378
379 for (@input)
380 {
381 # Print the main tests
382 #
383 # Test lines look like this:
384 # m 3 b {\(a\)b} ab ab a
385 #
386 # Also looks for heading lines, e.g.:
387 # doing 4 "parentheses"
388 #
389 for (split "\n") {
390 if (/^doing\s+(\S+)\s+(\S.*)/) {
391 handle_doing parsetcl "$1 $2";
392 } elsif (/^[efimp]\s/) {
393 handle_test parsetcl $_;
394 }
395 }
396
397 # Extra tests
398 #
399 # The expression below matches something like this:
400 # test reg-33.8 {Bug 505048} {
401 # regexp -inline {\A\s*[^b]*b} ab
402 # } ab
403 #
404 # The three subexpressions then return these parts:
405 # $extras[$i] = '{Bug 505048}',
406 # $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
407 # $extras[$i + 2] = 'ab'
408 #
409 my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n # line 1
410 \s*regexp\s+([^\n]+)\n # line 2
411 \}\s*(\S[^\n]*)/gx; # line 3
412
413 handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
414
415 for (my $i = 0; $i < @extras; $i += 3) {
416 my $id = $extras[$i];
417
418 # further parse the middle line into options and the rest (i.e. $args)
419 my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
420
421 my @args = parsetcl $args;
422 $#args = 1; # only want the first two
423
424 # now handle the options
425 my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
426 my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
427
428 # get them all in the right order and print
429 unshift @args, $test, parsetcl($id), $results ? '-' : 'o';
430 push @args, parsetcl(parsetcl($results)) if $results;
431 handle_test @args;
432 }
433 }
434
435 # finish
436 #
437 handle_end;