tests/regex/regex.pl

   1 #!/usr/bin/env perl -w
   2 #############################################################################
   3 # Name:        regex.pl
   4 # Purpose:     Generate test code for wxRegEx from 'reg.test'
   5 # Author:      Mike Wetherell
   6 # RCS-ID:      $Id$
   7 # Copyright:   (c) Mike Wetherell
   8 # Licence:     wxWidgets licence
   9 #############################################################################
  10
  11 #
  12 # Notes:
  13 #   See './regex.pl -h' for usage
  14 #
  15 #   Output at the moment is C++ using the cppunit testing framework. The
  16 #   language/framework specifics are separated, with the following 5
  17 #   subs as an interface: 'begin_output', 'begin_section', 'write_test',
  18 #   'end_section' and 'end_output'. So for a different language/framework,
  19 #   implement 5 new similar subs.
  20 #
  21 #   I've avoided using 'use encoding "UTF-8"', since this wasn't available
  22 #   in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
  23 #   earler than perl 5.6.0 aren't going to work.
  24 #
  25
  26 use strict;
  27 use File::Basename;
  28 #use encoding "UTF-8";  # enable in the future when perl 5.6.x is just a memory
  29
  30 # if 0 output is wide characters, if 1 output is utf8 encoded
  31 my $utf = 1;
  32
  33 # quote a parameter (C++ helper)
  34 #
  35 sub quotecxx {
  36     my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
  37                 "\n" => "n", "\r" => "r", "\t" => "t",
  38                 "\013" => "v", '"' => '"', "\\" => "\\" );
  39
  40     # working around lack of 'use encoding'
  41     $_ = pack "U0C*", unpack "C*", $_;
  42     use utf8;
  43
  44     s/[\000-\037"\\\177-\x{ffff}]/
  45         if ($esc{$&}) {
  46             "\\$esc{$&}";
  47         } elsif (ord($&) > 0x9f) {
  48             if ($utf) {
  49                 $&;
  50             } else {
  51                 sprintf "\\u%04x", ord($&);
  52             }
  53         } else {
  54             sprintf "\\%03o", ord($&);
  55         }
  56     /ge;
  57
  58     # working around lack of 'use encoding'
  59     no utf8;
  60     $_ = pack "C*", unpack "C*", $_;
  61
  62     return ($utf ? '"' : 'L"') . $_ . '"'
  63 }
  64
  65 # start writing the output code (C++ interface)
  66 #
  67 sub begin_output {
  68     my ($from, $instructions) = @_;
  69
  70     # embed it in the comment
  71     $from = "\n$from";
  72     $from =~ s/^(?:   )?/ * /mg;
  73
  74     # $instructions contains information about the flags etc.
  75     if ($instructions) {
  76         $instructions = "\n$instructions";
  77         $instructions =~ s/^(?:   )?/ * /mg;
  78     }
  79
  80     my $u = $utf ? " (UTF-8 encoded)" : "";
  81
  82     print <<EOT;
  83 /*
  84  * Test data for wxRegEx$u
  85 $from$instructions */
  86
  87 EOT
  88 }
  89
  90 my @classes;
  91
  92 # start a new section (C++ interface)
  93 #
  94 sub begin_section {
  95     my ($id, $title) = @_;
  96     my $class = "regextest_$id";
  97     $class =~ s/\W/_/g;
  98     push @classes, [$id, $class];
  99
 100     print <<EOT;
 101
 102 /*
 103  * $id $title
 104  */
 105
 106 class $class : public RegExTestSuite
 107 {
 108 public:
 109     $class() : RegExTestSuite("regex.$id") { }
 110     static Test *suite();
 111 };
 112
 113 Test *$class\::suite()
 114 {
 115     RegExTestSuite *suite = new $class;
 116
 117 EOT
 118 }
 119
 120 # output a test line (C++ interface)
 121 #
 122 sub write_test {
 123     my @args = @_;
 124     $_ = quotecxx for @args;
 125     print "    suite->add(" . (join ', ', @args) . ", NULL);\n";
 126 }
 127
 128 # end a section (C++ interface)
 129 #
 130 sub end_section {
 131     my ($id, $class) = @{$classes[$#classes]};
 132
 133     print <<EOT;
 134
 135     return suite;
 136 }
 137
 138 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
 139
 140 EOT
 141 }
 142
 143 # finish off the output (C++ interface)
 144 #
 145 sub end_output {
 146     print <<EOT;
 147
 148 /*
 149  * A suite containing all the above suites
 150  */
 151
 152 class regextest : public TestSuite
 153 {
 154 public:
 155     regextest() : TestSuite("regex") { }
 156     static Test *suite();
 157 };
 158
 159 Test *regextest::suite()
 160 {
 161     TestSuite *suite = new regextest;
 162
 163 EOT
 164     print "    suite->addTest(".$_->[1]."::suite());\n" for @classes;
 165
 166     print <<EOT;
 167
 168     return suite;
 169 }
 170
 171 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
 172 CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
 173 EOT
 174 }
 175
 176 # Parse a tcl string. Handles curly quoting and double quoting.
 177 #
 178 sub parsetcl {
 179     my ($curly, $quote);
 180     # recursively defined expression that can parse balanced braces
 181     # warning: uses experimental features of perl, see perlop(1)
 182     $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
 183     $quote = qr/"(?:\\"|[^"])*"/;
 184     my @tokens = shift =~ /($curly|$quote|\S+)/g;
 185
 186     # now remove braces/quotes and unescape any escapes
 187     for (@tokens) {
 188         if (s/^{(.*)}$/$1/) {
 189             # for curly quoting, only unescape \{ and \}
 190             s/\\([{}])/$1/g;
 191         } else {
 192             s/^"(.*)"$/$1/;
 193
 194             # unescape any escapes
 195             my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
 196                         "n" => "\n", "r" => "\r", "t" => "\t",
 197                         "v" => "\013" );
 198             my $x = qr/[[:xdigit:]]/;
 199
 200             s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
 201                 if ($1 =~ m{^([0-7]+)}) {
 202                     chr(oct($1));
 203                 } elsif ($1 =~ m{^x($x+)}) {
 204                     pack("C0U", hex($1) & 0xff);
 205                 } elsif ($1 =~ m{^u($x+)}) {
 206                     pack("C0U", hex($1));
 207                 } elsif ($esc{$1}) {
 208                     $esc{$1};
 209                 } else {
 210                     $1;
 211                 }
 212             /ge;
 213         }
 214     }
 215
 216     return @tokens;
 217 }
 218
 219 # helpers which keep track of whether begin_section has been called, so that
 220 # end_section can be called when appropriate
 221 #
 222 my @doing = ("0", "");
 223 my $in_section = 0;
 224
 225 sub handle_doing {
 226     end_section if $in_section;
 227     $in_section = 0;
 228     @doing = @_;
 229 }
 230
 231 sub handle_test {
 232     begin_section(@doing) if !$in_section;
 233     $in_section = 1;
 234     write_test @_;
 235 }
 236
 237 sub handle_end {
 238     end_section if $in_section;
 239     $in_section = 0;
 240     end_output;
 241 }
 242
 243 # 'main' - start by parsing the command lines options.
 244 #
 245 my $badoption = !@ARGV;
 246 my $utfdefault = $utf;
 247 my $outputname;
 248
 249 for (my $i = 0; $i < @ARGV; ) {
 250     if ($ARGV[$i] !~ m{^-.}) {
 251         $i++;
 252         next;
 253     }
 254
 255     if ($ARGV[$i] eq '--') {
 256         splice @ARGV, $i, 1;
 257         last;
 258     }
 259
 260     if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) {       # -o : output file
 261         $outputname = $2 || splice @ARGV, $i + 1, 1;
 262     }
 263
 264     for (split //, substr($ARGV[$i], 1)) {
 265         if (/u/i) {                                 # -u : utf-8 output
 266             $utf = 1;
 267         } elsif (/w/i) {                            # -w : wide char output
 268             $utf = 0;
 269         } else {
 270             $badoption = 1;
 271         }
 272     }
 273
 274     splice @ARGV, $i, 1;
 275 }
 276
 277 # Display help
 278 #
 279 if ($badoption) {
 280     my $prog = basename $0;
 281     my ($w, $u) = (" (default)", "          ");
 282     ($w, $u) = ($u, $w) if $utfdefault;
 283
 284     print <<EOT;
 285 Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
 286 Generate test code for wxRegEx from 'reg.test'
 287 Example: $prog -o regex.inc reg.test wxreg.test
 288
 289  -w$w   Output will be wide characters.
 290  -u$u   Output will be UTF-8 encoded.
 291
 292 Input files should be in UTF-8. If no input files are specified input is
 293 read from stdin. If no output file is specified output is written to stdout.
 294 See the comments in reg.test (in src/regex) for details of the input file
 295 format.
 296 EOT
 297     exit 0;
 298 }
 299
 300 # Open the output file
 301 #
 302 open STDOUT, ">$outputname" if $outputname;
 303
 304 # Read in the files and initially parse just the comments for copyright
 305 # information and instructions on the tests
 306 #
 307 my @input;                  # slurped input files stripped of comments
 308 my $files = "";             # copyright info from the input comments
 309 my $instructions = "";      # test instructions from the input comments
 310
 311 do {
 312     my $inputname = basename $ARGV[0] if @ARGV;
 313
 314     # slurp input
 315     undef $/;
 316     my $in = <>;
 317
 318     # remove escaped newlines
 319     $in =~ s/(?<!\\)\\\n//g;
 320
 321     # record the copyrights of the input files
 322     for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
 323         s/[\s:]+/ /g;
 324         $files .= "  ";
 325         $files .= $inputname . ": " if $inputname && $inputname ne '-';
 326         $files .= "$_\n";
 327     }
 328
 329     # Parse the comments for instructions on the tests, which look like this:
 330     #    i    successful match with -indices (used in checking things like
 331     #         nonparticipating subexpressions)
 332     if (!$instructions) {
 333         my $sp = qr{\t|   +};                   # tab or three or more spaces
 334         my @instructions = $in =~
 335             /\n(
 336                 (?:
 337                     \#$sp\S?$sp\S[^\n]+\n       # instruction line
 338                     (?:\#$sp$sp\S[^\n]+\n)*     # continuation lines (if any)
 339                 )+
 340             )/gx;
 341
 342         if (@instructions) {
 343             $instructions[0] = "Test types:\n$instructions[0]";
 344             if (@instructions > 1) {
 345                 $instructions[1] = "Flag characters:\n$instructions[1]";
 346             }
 347             $instructions = join "\n", @instructions;
 348             $instructions =~ s/^#([^\t]?)/ $1/mg;
 349         }
 350     }
 351
 352     # @input is the input of all files (stipped of comments)
 353     $in =~ s/^#.*$//mg;
 354     push @input, $in;
 355
 356 } while $ARGV[0];
 357
 358 # Make a string naming the generator, the input files and copyright info
 359 #
 360 my $from = "Generated " . localtime() . " by " . basename $0;
 361 $from =~ s/[\s]+/ /g;
 362 if ($files) {
 363     if ($files =~ /:/) {
 364         $from .= " from the following files:";
 365     } else {
 366         $from .= " from work with the following copyright:";
 367     }
 368 }
 369 $from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g);  # word-wrap
 370 $from .= "\n$files" if $files;
 371
 372 # Now start to print the code
 373 #
 374 begin_output $from, $instructions;
 375
 376 # numbers for 'extra' sections
 377 my $extra = 1;
 378
 379 for (@input)
 380 {
 381     # Print the main tests
 382     #
 383     # Test lines look like this:
 384     # m  3  b       {\(a\)b}        ab      ab      a
 385     #
 386     # Also looks for heading lines, e.g.:
 387     # doing 4 "parentheses"
 388     #
 389     for (split "\n") {
 390         if (/^doing\s+(\S+)\s+(\S.*)/) {
 391             handle_doing parsetcl "$1 $2";
 392         } elsif (/^[efimp]\s/) {
 393             handle_test parsetcl $_;
 394         }
 395     }
 396
 397     # Extra tests
 398     #
 399     # The expression below matches something like this:
 400     #   test reg-33.8 {Bug 505048} {
 401     #       regexp -inline {\A\s*[^b]*b} ab
 402     #   } ab
 403     #
 404     # The three subexpressions then return these parts:
 405     #   $extras[$i]     = '{Bug 505048}',
 406     #   $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
 407     #   $extras[$i + 2] = 'ab'
 408     #
 409     my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n       # line 1
 410                   \s*regexp\s+([^\n]+)\n                # line 2
 411                   \}\s*(\S[^\n]*)/gx;                   # line 3
 412
 413     handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
 414
 415     for (my $i = 0; $i < @extras; $i += 3) {
 416         my $id = $extras[$i];
 417
 418         # further parse the middle line into options and the rest (i.e. $args)
 419         my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
 420
 421         my @args = parsetcl $args;
 422         $#args = 1;     # only want the first two
 423
 424         # now handle the options
 425         my $test    = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
 426         my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
 427
 428         # get them all in the right order and print
 429         unshift @args, $test, parsetcl($id), '-';
 430         push @args, parsetcl(parsetcl($results)) if $results;
 431         handle_test @args;
 432     }
 433 }
 434
 435 # finish
 436 #
 437 handle_end;