tests/regex/regex.pl

   1 #!/usr/bin/env perl
   2 #############################################################################
   3 # Name:        regex.pl
   4 # Purpose:     Generate test code for wxRegEx from 'reg.test'
   5 # Author:      Mike Wetherell
   6 # RCS-ID:      $Id$
   7 # Copyright:   (c) Mike Wetherell
   8 # Licence:     wxWindows licence
   9 #############################################################################
  10
  11 #
  12 # Notes:
  13 #   See './regex.pl -h' for usage
  14 #
  15 #   Output at the moment is C++ using the cppunit testing framework. The
  16 #   language/framework specifics are separated, with the following 5
  17 #   subs as an interface: 'begin_output', 'begin_section', 'write_test',
  18 #   'end_section' and 'end_output'. So for a different language/framework,
  19 #   implement 5 new similar subs.
  20 #
  21 #   I've avoided using 'use encoding "UTF-8"', since this wasn't available
  22 #   in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
  23 #   earler than perl 5.6.0 aren't going to work.
  24 #
  25
  26 use strict;
  27 use warnings;
  28 use File::Basename;
  29 #use encoding "UTF-8";  # enable in the future when perl 5.6.x is just a memory
  30
  31 # if 0 output is wide characters, if 1 output is utf8 encoded
  32 my $utf = 1;
  33
  34 # quote a parameter (C++ helper)
  35 #
  36 sub quotecxx {
  37     my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
  38                 "\n" => "n", "\r" => "r", "\t" => "t",
  39                 "\013" => "v", '"' => '"', "\\" => "\\" );
  40
  41     # working around lack of 'use encoding'
  42     if (!$utf) {
  43         $_ = pack "U0C*", unpack "C*", $_;
  44         use utf8;
  45     }
  46
  47     s/[\000-\037"\\\177-\x{ffff}]/
  48         if ($esc{$&}) {
  49             "\\$esc{$&}";
  50         } elsif (ord($&) > 0x9f && !$utf) {
  51             sprintf "\\u%04x", ord($&);
  52         } else {
  53             sprintf "\\%03o", ord($&);
  54         }
  55     /ge;
  56
  57     # working around lack of 'use encoding'
  58     if (!$utf) {
  59         no utf8;
  60         $_ = pack "C*", unpack "C*", $_;
  61     }
  62
  63     return ($utf ? '"' : 'L"') . $_ . '"'
  64 }
  65
  66 # start writing the output code (C++ interface)
  67 #
  68 sub begin_output {
  69     my ($from, $instructions) = @_;
  70
  71     # embed it in the comment
  72     $from = "\n$from";
  73     $from =~ s/^(?:   )?/ * /mg;
  74
  75     # $instructions contains information about the flags etc.
  76     if ($instructions) {
  77         $instructions = "\n$instructions";
  78         $instructions =~ s/^(?:   )?/ * /mg;
  79     }
  80
  81     my $u = $utf ? " (UTF-8 encoded)" : "";
  82
  83     print <<EOT;
  84 /*
  85  * Test data for wxRegEx$u
  86 $from$instructions */
  87
  88 EOT
  89 }
  90
  91 my @classes;
  92
  93 # start a new section (C++ interface)
  94 #
  95 sub begin_section {
  96     my ($id, $title) = @_;
  97     my $class = "regextest_$id";
  98     $class =~ s/\W/_/g;
  99     push @classes, [$id, $class];
 100
 101     print <<EOT;
 102
 103 /*
 104  * $id $title
 105  */
 106
 107 class $class : public RegExTestSuite
 108 {
 109 public:
 110     $class() : RegExTestSuite("regex.$id") { }
 111     static Test *suite();
 112 };
 113
 114 Test *$class\::suite()
 115 {
 116     RegExTestSuite *suite = new $class;
 117
 118 EOT
 119 }
 120
 121 # output a test line (C++ interface)
 122 #
 123 sub write_test {
 124     my @args = @_;
 125     $_ = quotecxx for @args;
 126     print "    suite->add(" . (join ', ', @args) . ", NULL);\n";
 127 }
 128
 129 # end a section (C++ interface)
 130 #
 131 sub end_section {
 132     my ($id, $class) = @{$classes[$#classes]};
 133
 134     print <<EOT;
 135
 136     return suite;
 137 }
 138
 139 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
 140
 141 EOT
 142 }
 143
 144 # finish off the output (C++ interface)
 145 #
 146 sub end_output {
 147     print <<EOT;
 148
 149 /*
 150  * A suite containing all the above suites
 151  */
 152
 153 class regextest : public TestSuite
 154 {
 155 public:
 156     regextest() : TestSuite("regex") { }
 157     static Test *suite();
 158 };
 159
 160 Test *regextest::suite()
 161 {
 162     TestSuite *suite = new regextest;
 163
 164 EOT
 165     print "    suite->addTest(".$_->[1]."::suite());\n" for @classes;
 166
 167     print <<EOT;
 168
 169     return suite;
 170 }
 171
 172 CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
 173 CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
 174 EOT
 175 }
 176
 177 # Parse a tcl string. Handles curly quoting and double quoting.
 178 #
 179 sub parsetcl {
 180     my ($curly, $quote);
 181     # recursively defined expression that can parse balanced braces
 182     # warning: uses experimental features of perl, see perlop(1)
 183     $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
 184     $quote = qr/"(?:\\"|[^"])*"/;
 185     my @tokens = shift =~ /($curly|$quote|\S+)/g;
 186
 187     # now remove braces/quotes and unescape any escapes
 188     for (@tokens) {
 189         if (s/^{(.*)}$/$1/) {
 190             # for curly quoting, only unescape \{ and \}
 191             s/\\([{}])/$1/g;
 192         } else {
 193             s/^"(.*)"$/$1/;
 194
 195             # unescape any escapes
 196             my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
 197                         "n" => "\n", "r" => "\r", "t" => "\t",
 198                         "v" => "\013" );
 199             my $x = qr/[[:xdigit:]]/;
 200
 201             s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
 202                 if ($1 =~ m{^([0-7]+)}) {
 203                     chr(oct($1));
 204                 } elsif ($1 =~ m{^x($x+)}) {
 205                     pack("C0U", hex($1) & 0xff);
 206                 } elsif ($1 =~ m{^u($x+)}) {
 207                     pack("C0U", hex($1));
 208                 } elsif ($esc{$1}) {
 209                     $esc{$1};
 210                 } else {
 211                     $1;
 212                 }
 213             /ge;
 214         }
 215     }
 216
 217     return @tokens;
 218 }
 219
 220 # helpers which keep track of whether begin_section has been called, so that
 221 # end_section can be called when appropriate
 222 #
 223 my @doing = ("0", "");
 224 my $in_section = 0;
 225
 226 sub handle_doing {
 227     end_section if $in_section;
 228     $in_section = 0;
 229     @doing = @_;
 230 }
 231
 232 sub handle_test {
 233     begin_section(@doing) if !$in_section;
 234     $in_section = 1;
 235     write_test @_;
 236 }
 237
 238 sub handle_end {
 239     end_section if $in_section;
 240     $in_section = 0;
 241     end_output;
 242 }
 243
 244 # 'main' - start by parsing the command lines options.
 245 #
 246 my $badoption = !@ARGV;
 247 my $utfdefault = $utf;
 248 my $outputname;
 249
 250 for (my $i = 0; $i < @ARGV; ) {
 251     if ($ARGV[$i] !~ m{^-.}) {
 252         $i++;
 253         next;
 254     }
 255
 256     if ($ARGV[$i] eq '--') {
 257         splice @ARGV, $i, 1;
 258         last;
 259     }
 260
 261     if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) {       # -o : output file
 262         $outputname = $2 || splice @ARGV, $i + 1, 1;
 263     }
 264
 265     for (split //, substr($ARGV[$i], 1)) {
 266         if (/u/i) {                                 # -u : utf-8 output
 267             $utf = 1;
 268         } elsif (/w/i) {                            # -w : wide char output
 269             $utf = 0;
 270         } else {
 271             $badoption = 1;
 272         }
 273     }
 274
 275     splice @ARGV, $i, 1;
 276 }
 277
 278 # Display help
 279 #
 280 if ($badoption) {
 281     my $prog = basename $0;
 282     my ($w, $u) = (" (default)", "          ");
 283     ($w, $u) = ($u, $w) if $utfdefault;
 284
 285     print <<EOT;
 286 Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
 287 Generate test code for wxRegEx from 'reg.test'
 288 Example: $prog -o regex.inc reg.test wxreg.test
 289
 290  -w$w   Output will be wide characters.
 291  -u$u   Output will be UTF-8 encoded.
 292
 293 Input files should be in UTF-8. If no input files are specified input is
 294 read from stdin. If no output file is specified output is written to stdout.
 295 See the comments in reg.test for details of the input file format.
 296 EOT
 297     exit 0;
 298 }
 299
 300 # Open the output file
 301 #
 302 open STDOUT, ">$outputname" if $outputname;
 303
 304 # Read in the files and initially parse just the comments for copyright
 305 # information and instructions on the tests
 306 #
 307 my @input;                  # slurped input files stripped of comments
 308 my $files = "";             # copyright info from the input comments
 309 my $instructions = "";      # test instructions from the input comments
 310
 311 do {
 312     my $inputname = basename $ARGV[0] if @ARGV;
 313
 314     # slurp input
 315     undef $/;
 316     my $in = <>;
 317
 318     # remove escaped newlines
 319     $in =~ s/(?<!\\)\\\n//g;
 320
 321     # record the copyrights of the input files
 322     for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
 323         s/[\s:]+/ /g;
 324         $files .= "  ";
 325         $files .= $inputname . ": " if $inputname && $inputname ne '-';
 326         $files .= "$_\n";
 327     }
 328
 329     # Parse the comments for instructions on the tests, which look like this:
 330     #    i    successful match with -indices (used in checking things like
 331     #         nonparticipating subexpressions)
 332     if (!$instructions) {
 333         my $sp = qr{\t|   +};                   # tab or three or more spaces
 334         my @instructions = $in =~
 335             /\n(
 336                 (?:
 337                     \#$sp\S?$sp\S[^\n]+\n       # instruction line
 338                     (?:\#$sp$sp\S[^\n]+\n)*     # continuation lines (if any)
 339                 )+
 340             )/gx;
 341
 342         if (@instructions) {
 343             $instructions[0] = "Test types:\n$instructions[0]";
 344             if (@instructions > 1) {
 345                 $instructions[1] = "Flag characters:\n$instructions[1]";
 346             }
 347             $instructions = join "\n", @instructions;
 348             $instructions =~ s/^#([^\t]?)/ $1/mg;
 349         }
 350     }
 351
 352     # @input is the input of all files (stipped of comments)
 353     $in =~ s/^#.*$//mg;
 354     push @input, $in;
 355
 356 } while $ARGV[0];
 357
 358 # Make a string naming the generator, the input files and copyright info
 359 #
 360 my $from = "Generated " . localtime() . " by " . basename $0;
 361 $from =~ s/[\s]+/ /g;
 362 if ($files) {
 363     if ($files =~ /:/) {
 364         $from .= " from the following files:";
 365     } else {
 366         $from .= " from work with the following copyright:";
 367     }
 368 }
 369 $from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g);  # word-wrap
 370 $from .= "\n$files" if $files;
 371
 372 # Now start to print the code
 373 #
 374 begin_output $from, $instructions;
 375
 376 # numbers for 'extra' sections
 377 my $extra = 1;
 378
 379 for (@input)
 380 {
 381     # Print the main tests
 382     #
 383     # Test lines look like this:
 384     # m  3  b       {\(a\)b}        ab      ab      a
 385     #
 386     # Also looks for heading lines, e.g.:
 387     # doing 4 "parentheses"
 388     #
 389     for (split "\n") {
 390         if (/^doing\s+(\S+)\s+(\S.*)/) {
 391             handle_doing parsetcl "$1 $2";
 392         } elsif (/^[efimp]\s/) {
 393             handle_test parsetcl $_;
 394         }
 395     }
 396
 397     # Extra tests
 398     #
 399     # The expression below matches something like this:
 400     #   test reg-33.8 {Bug 505048} {
 401     #       regexp -inline {\A\s*[^b]*b} ab
 402     #   } ab
 403     #
 404     # The three subexpressions then return these parts:
 405     #   $extras[$i]     = '{Bug 505048}',
 406     #   $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
 407     #   $extras[$i + 2] = 'ab'
 408     #
 409     my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n       # line 1
 410                   \s*regexp\s+([^\n]+)\n                # line 2
 411                   \}\s*(\S[^\n]*)/gx;                   # line 3
 412
 413     handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
 414
 415     for (my $i = 0; $i < @extras; $i += 3) {
 416         my $id = $extras[$i];
 417
 418         # further parse the middle line into options and the rest (i.e. $args)
 419         my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
 420
 421         my @args = parsetcl $args;
 422         $#args = 1;     # only want the first two
 423
 424         # now handle the options
 425         my $test    = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
 426         my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
 427
 428         # get them all in the right order and print
 429         unshift @args, $test, parsetcl($id), $results ? '-' : 'o';
 430         push @args, parsetcl(parsetcl($results)) if $results;
 431         handle_test @args;
 432     }
 433 }
 434
 435 # finish
 436 #
 437 handle_end;