tests/regex/regex.pl

   1 #!/usr/bin/env perl
   2 #############################################################################
   3 # Name:        regex.pl
   4 # Purpose:     Generate test code for wxRegEx from 'reg.test'
   5 # Author:      Mike Wetherell
   6 # Copyright:   (c) Mike Wetherell
   7 # Licence:     wxWindows licence
   8 #############################################################################
   9
  10 #
  11 # Notes:
  12 #   See './regex.pl -h' for usage
  13 #
  14 #   Output at the moment is C++ using the cppunit testing framework. The
  15 #   language/framework specifics are separated, with the following 5
  16 #   subs as an interface: 'begin_output', 'begin_section', 'write_test',
  17 #   'end_section' and 'end_output'. So for a different language/framework,
  18 #   implement 5 new similar subs.
  19 #
#   I've avoided using 'use encoding "UTF-8"', since this wasn't available
#   in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
#   earlier than perl 5.6.0 aren't going to work.
  23 #
  24
  25 use strict;
  26 use warnings;
  27 use File::Basename;
  28 #use encoding "UTF-8";  # enable in the future when perl 5.6.x is just a memory
  29
  30 # if 0 output is wide characters, if 1 output is utf8 encoded
  31 my $utf = 1;
  32
  33 # quote a parameter (C++ helper)
  34 #
  35 sub quotecxx {
  36     my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
  37                 "\n" => "n", "\r" => "r", "\t" => "t",
  38                 "\013" => "v", '"' => '"', "\\" => "\\" );
  39
  40     # working around lack of 'use encoding'
  41     if (!$utf) {
  42         $_ = pack "U0C*", unpack "C*", $_;
  43         use utf8;
  44     }
  45
  46     s/[\000-\037"\\\177-\x{ffff}]/
  47         if ($esc{$&}) {
  48             "\\$esc{$&}";
  49         } elsif (ord($&) > 0x9f && !$utf) {
  50             sprintf "\\u%04x", ord($&);
  51         } else {
  52             sprintf "\\%03o", ord($&);
  53         }
  54     /ge;
  55
  56     # working around lack of 'use encoding'
  57     if (!$utf) {
  58         no utf8;
  59         $_ = pack "C*", unpack "C*", $_;
  60     }
  61
  62     return ($utf ? '"' : 'L"') . $_ . '"'
  63 }
  64
  65 # start writing the output code (C++ interface)
  66 #
  67 sub begin_output {
  68     my ($from, $instructions) = @_;
  69
  70     # embed it in the comment
  71     $from = "\n$from";
  72     $from =~ s/^(?:   )?/ * /mg;
  73
  74     # $instructions contains information about the flags etc.
  75     if ($instructions) {
  76         $instructions = "\n$instructions";
  77         $instructions =~ s/^(?:   )?/ * /mg;
  78     }
  79
  80     my $u = $utf ? " (UTF-8 encoded)" : "";
  81
  82     print <<EOT;
  83 /*
  84  * Test data for wxRegEx$u
  85 $from$instructions */
  86
  87 EOT
  88 }
  89
  90 my @classes;
  91
  92 # start a new section (C++ interface)
  93 #
  94 sub begin_section {
  95     my ($id, $title) = @_;
  96     my $class = "regextest_$id";
  97     $class =~ s/\W/_/g;
  98     push @classes, [$id, $class];
  99
 100     print <<EOT;
 101
 102 /*
 103  * $id $title
 104  */
 105
 106 class $class : public RegExTestSuite
 107 {
 108 public:
 109     $class() : RegExTestSuite("regex.$id") { }
 110     static Test *suite();
 111 };
 112
 113 Test *$class\::suite()
 114 {
 115     RegExTestSuite *suite = new $class;
 116
 117 EOT
 118 }
 119
 120 # output a test line (C++ interface)
 121 #
 122 sub write_test {
 123     my @args = @_;
 124     $_ = quotecxx for @args;
 125     print "    suite->add(" . (join ', ', @args) . ", NULL);\n";
 126 }
 127
 128 # end a section (C++ interface)
 129 #
 130 sub end_section {
 131     my ($id, $class) = @{$classes[$#classes]};
 132
 133     print <<EOT;
 134
 135     return suite;
 136 }
 137
 138 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
 139
 140 EOT
 141 }
 142
 143 # finish off the output (C++ interface)
 144 #
 145 sub end_output {
 146     print <<EOT;
 147
 148 /*
 149  * A suite containing all the above suites
 150  */
 151
 152 class regextest : public TestSuite
 153 {
 154 public:
 155     regextest() : TestSuite("regex") { }
 156     static Test *suite();
 157 };
 158
 159 Test *regextest::suite()
 160 {
 161     TestSuite *suite = new regextest;
 162
 163 EOT
 164     print "    suite->addTest(".$_->[1]."::suite());\n" for @classes;
 165
 166     print <<EOT;
 167
 168     return suite;
 169 }
 170
 171 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
 172 CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
 173 EOT
 174 }
 175
 176 # Parse a tcl string. Handles curly quoting and double quoting.
 177 #
 178 sub parsetcl {
 179     my ($curly, $quote);
 180     # recursively defined expression that can parse balanced braces
 181     # warning: uses experimental features of perl, see perlop(1)
 182     $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
 183     $quote = qr/"(?:\\"|[^"])*"/;
 184     my @tokens = shift =~ /($curly|$quote|\S+)/g;
 185
 186     # now remove braces/quotes and unescape any escapes
 187     for (@tokens) {
 188         if (s/^{(.*)}$/$1/) {
 189             # for curly quoting, only unescape \{ and \}
 190             s/\\([{}])/$1/g;
 191         } else {
 192             s/^"(.*)"$/$1/;
 193
 194             # unescape any escapes
 195             my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
 196                         "n" => "\n", "r" => "\r", "t" => "\t",
 197                         "v" => "\013" );
 198             my $x = qr/[[:xdigit:]]/;
 199
 200             s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
 201                 if ($1 =~ m{^([0-7]+)}) {
 202                     chr(oct($1));
 203                 } elsif ($1 =~ m{^x($x+)}) {
 204                     pack("C0U", hex($1) & 0xff);
 205                 } elsif ($1 =~ m{^u($x+)}) {
 206                     pack("C0U", hex($1));
 207                 } elsif ($esc{$1}) {
 208                     $esc{$1};
 209                 } else {
 210                     $1;
 211                 }
 212             /ge;
 213         }
 214     }
 215
 216     return @tokens;
 217 }
 218
 219 # helpers which keep track of whether begin_section has been called, so that
 220 # end_section can be called when appropriate
 221 #
 222 my @doing = ("0", "");
 223 my $in_section = 0;
 224
 225 sub handle_doing {
 226     end_section if $in_section;
 227     $in_section = 0;
 228     @doing = @_;
 229 }
 230
 231 sub handle_test {
 232     begin_section(@doing) if !$in_section;
 233     $in_section = 1;
 234     write_test @_;
 235 }
 236
 237 sub handle_end {
 238     end_section if $in_section;
 239     $in_section = 0;
 240     end_output;
 241 }
 242
 243 # 'main' - start by parsing the command lines options.
 244 #
 245 my $badoption = !@ARGV;
 246 my $utfdefault = $utf;
 247 my $outputname;
 248
 249 for (my $i = 0; $i < @ARGV; ) {
 250     if ($ARGV[$i] !~ m{^-.}) {
 251         $i++;
 252         next;
 253     }
 254
 255     if ($ARGV[$i] eq '--') {
 256         splice @ARGV, $i, 1;
 257         last;
 258     }
 259
 260     if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) {       # -o : output file
 261         $outputname = $2 || splice @ARGV, $i + 1, 1;
 262     }
 263
 264     for (split //, substr($ARGV[$i], 1)) {
 265         if (/u/i) {                                 # -u : utf-8 output
 266             $utf = 1;
 267         } elsif (/w/i) {                            # -w : wide char output
 268             $utf = 0;
 269         } else {
 270             $badoption = 1;
 271         }
 272     }
 273
 274     splice @ARGV, $i, 1;
 275 }
 276
 277 # Display help
 278 #
 279 if ($badoption) {
 280     my $prog = basename $0;
 281     my ($w, $u) = (" (default)", "          ");
 282     ($w, $u) = ($u, $w) if $utfdefault;
 283
 284     print <<EOT;
 285 Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
 286 Generate test code for wxRegEx from 'reg.test'
 287 Example: $prog -o regex.inc reg.test wxreg.test
 288
 289  -w$w   Output will be wide characters.
 290  -u$u   Output will be UTF-8 encoded.
 291
 292 Input files should be in UTF-8. If no input files are specified input is
 293 read from stdin. If no output file is specified output is written to stdout.
 294 See the comments in reg.test for details of the input file format.
 295 EOT
 296     exit 0;
 297 }
 298
 299 # Open the output file
 300 #
 301 open STDOUT, ">$outputname" if $outputname;
 302
 303 # Read in the files and initially parse just the comments for copyright
 304 # information and instructions on the tests
 305 #
 306 my @input;                  # slurped input files stripped of comments
 307 my $files = "";             # copyright info from the input comments
 308 my $instructions = "";      # test instructions from the input comments
 309
 310 do {
 311     my $inputname = basename $ARGV[0] if @ARGV;
 312
 313     # slurp input
 314     undef $/;
 315     my $in = <>;
 316
 317     # remove escaped newlines
 318     $in =~ s/(?<!\\)\\\n//g;
 319
 320     # record the copyrights of the input files
 321     for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
 322         s/[\s:]+/ /g;
 323         $files .= "  ";
 324         $files .= $inputname . ": " if $inputname && $inputname ne '-';
 325         $files .= "$_\n";
 326     }
 327
 328     # Parse the comments for instructions on the tests, which look like this:
 329     #    i    successful match with -indices (used in checking things like
 330     #         nonparticipating subexpressions)
 331     if (!$instructions) {
 332         my $sp = qr{\t|   +};                   # tab or three or more spaces
 333         my @instructions = $in =~
 334             /\n(
 335                 (?:
 336                     \#$sp\S?$sp\S[^\n]+\n       # instruction line
 337                     (?:\#$sp$sp\S[^\n]+\n)*     # continuation lines (if any)
 338                 )+
 339             )/gx;
 340
 341         if (@instructions) {
 342             $instructions[0] = "Test types:\n$instructions[0]";
 343             if (@instructions > 1) {
 344                 $instructions[1] = "Flag characters:\n$instructions[1]";
 345             }
 346             $instructions = join "\n", @instructions;
 347             $instructions =~ s/^#([^\t]?)/ $1/mg;
 348         }
 349     }
 350
 351     # @input is the input of all files (stipped of comments)
 352     $in =~ s/^#.*$//mg;
 353     push @input, $in;
 354
 355 } while $ARGV[0];
 356
 357 # Make a string naming the generator, the input files and copyright info
 358 #
 359 my $from = "Generated " . localtime() . " by " . basename $0;
 360 $from =~ s/[\s]+/ /g;
 361 if ($files) {
 362     if ($files =~ /:/) {
 363         $from .= " from the following files:";
 364     } else {
 365         $from .= " from work with the following copyright:";
 366     }
 367 }
 368 $from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g);  # word-wrap
 369 $from .= "\n$files" if $files;
 370
 371 # Now start to print the code
 372 #
 373 begin_output $from, $instructions;
 374
 375 # numbers for 'extra' sections
 376 my $extra = 1;
 377
 378 for (@input)
 379 {
 380     # Print the main tests
 381     #
 382     # Test lines look like this:
 383     # m  3  b       {\(a\)b}        ab      ab      a
 384     #
 385     # Also looks for heading lines, e.g.:
 386     # doing 4 "parentheses"
 387     #
 388     for (split "\n") {
 389         if (/^doing\s+(\S+)\s+(\S.*)/) {
 390             handle_doing parsetcl "$1 $2";
 391         } elsif (/^[efimp]\s/) {
 392             handle_test parsetcl $_;
 393         }
 394     }
 395
 396     # Extra tests
 397     #
 398     # The expression below matches something like this:
 399     #   test reg-33.8 {Bug 505048} {
 400     #       regexp -inline {\A\s*[^b]*b} ab
 401     #   } ab
 402     #
 403     # The three subexpressions then return these parts:
 404     #   $extras[$i]     = '{Bug 505048}',
 405     #   $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
 406     #   $extras[$i + 2] = 'ab'
 407     #
 408     my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n       # line 1
 409                   \s*regexp\s+([^\n]+)\n                # line 2
 410                   \}\s*(\S[^\n]*)/gx;                   # line 3
 411
 412     handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
 413
 414     for (my $i = 0; $i < @extras; $i += 3) {
 415         my $id = $extras[$i];
 416
 417         # further parse the middle line into options and the rest (i.e. $args)
 418         my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
 419
 420         my @args = parsetcl $args;
 421         $#args = 1;     # only want the first two
 422
 423         # now handle the options
 424         my $test    = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
 425         my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
 426
 427         # get them all in the right order and print
 428         unshift @args, $test, parsetcl($id), $results ? '-' : 'o';
 429         push @args, parsetcl(parsetcl($results)) if $results;
 430         handle_test @args;
 431     }
 432 }
 433
 434 # finish
 435 #
 436 handle_end;