| 1 | #!/usr/bin/env perl |
| 2 | ############################################################################# |
| 3 | # Name: regex.pl |
| 4 | # Purpose: Generate test code for wxRegEx from 'reg.test' |
| 5 | # Author: Mike Wetherell |
| 6 | # RCS-ID: $Id$ |
| 7 | # Copyright: (c) Mike Wetherell |
| 8 | # Licence: wxWidgets licence |
| 9 | ############################################################################# |
| 10 | |
| 11 | # |
| 12 | # Notes: |
| 13 | # See './regex.pl -h' for usage |
| 14 | # |
| 15 | # Output at the moment is C++ using the cppunit testing framework. The |
| 16 | # language/framework specifics are separated, with the following 5 |
| 17 | # subs as an interface: 'begin_output', 'begin_section', 'write_test', |
| 18 | # 'end_section' and 'end_output'. So for a different language/framework, |
| 19 | # implement 5 new similar subs. |
| 20 | # |
| 21 | # I've avoided using 'use encoding "UTF-8"', since this wasn't available |
| 22 | # in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions |
| 23 | # earler than perl 5.6.0 aren't going to work. |
| 24 | # |
| 25 | |
| 26 | use strict; |
| 27 | use warnings; |
| 28 | use File::Basename; |
| 29 | #use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory |
| 30 | |
| 31 | # if 0 output is wide characters, if 1 output is utf8 encoded |
| 32 | my $utf = 1; |
| 33 | |
| 34 | # quote a parameter (C++ helper) |
| 35 | # |
| 36 | sub quotecxx { |
| 37 | my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f", |
| 38 | "\n" => "n", "\r" => "r", "\t" => "t", |
| 39 | "\013" => "v", '"' => '"', "\\" => "\\" ); |
| 40 | |
| 41 | # working around lack of 'use encoding' |
| 42 | if (!$utf) { |
| 43 | $_ = pack "U0C*", unpack "C*", $_; |
| 44 | use utf8; |
| 45 | } |
| 46 | |
| 47 | s/[\000-\037"\\\177-\x{ffff}]/ |
| 48 | if ($esc{$&}) { |
| 49 | "\\$esc{$&}"; |
| 50 | } elsif (ord($&) > 0x9f && !$utf) { |
| 51 | sprintf "\\u%04x", ord($&); |
| 52 | } else { |
| 53 | sprintf "\\%03o", ord($&); |
| 54 | } |
| 55 | /ge; |
| 56 | |
| 57 | # working around lack of 'use encoding' |
| 58 | if (!$utf) { |
| 59 | no utf8; |
| 60 | $_ = pack "C*", unpack "C*", $_; |
| 61 | } |
| 62 | |
| 63 | return ($utf ? '"' : 'L"') . $_ . '"' |
| 64 | } |
| 65 | |
| 66 | # start writing the output code (C++ interface) |
| 67 | # |
| 68 | sub begin_output { |
| 69 | my ($from, $instructions) = @_; |
| 70 | |
| 71 | # embed it in the comment |
| 72 | $from = "\n$from"; |
| 73 | $from =~ s/^(?: )?/ * /mg; |
| 74 | |
| 75 | # $instructions contains information about the flags etc. |
| 76 | if ($instructions) { |
| 77 | $instructions = "\n$instructions"; |
| 78 | $instructions =~ s/^(?: )?/ * /mg; |
| 79 | } |
| 80 | |
| 81 | my $u = $utf ? " (UTF-8 encoded)" : ""; |
| 82 | |
| 83 | print <<EOT; |
| 84 | /* |
| 85 | * Test data for wxRegEx$u |
| 86 | $from$instructions */ |
| 87 | |
| 88 | EOT |
| 89 | } |
| 90 | |
| 91 | my @classes; |
| 92 | |
| 93 | # start a new section (C++ interface) |
| 94 | # |
| 95 | sub begin_section { |
| 96 | my ($id, $title) = @_; |
| 97 | my $class = "regextest_$id"; |
| 98 | $class =~ s/\W/_/g; |
| 99 | push @classes, [$id, $class]; |
| 100 | |
| 101 | print <<EOT; |
| 102 | |
| 103 | /* |
| 104 | * $id $title |
| 105 | */ |
| 106 | |
| 107 | class $class : public RegExTestSuite |
| 108 | { |
| 109 | public: |
| 110 | $class() : RegExTestSuite("regex.$id") { } |
| 111 | static Test *suite(); |
| 112 | }; |
| 113 | |
| 114 | Test *$class\::suite() |
| 115 | { |
| 116 | RegExTestSuite *suite = new $class; |
| 117 | |
| 118 | EOT |
| 119 | } |
| 120 | |
| 121 | # output a test line (C++ interface) |
| 122 | # |
| 123 | sub write_test { |
| 124 | my @args = @_; |
| 125 | $_ = quotecxx for @args; |
| 126 | print " suite->add(" . (join ', ', @args) . ", NULL);\n"; |
| 127 | } |
| 128 | |
| 129 | # end a section (C++ interface) |
| 130 | # |
| 131 | sub end_section { |
| 132 | my ($id, $class) = @{$classes[$#classes]}; |
| 133 | |
| 134 | print <<EOT; |
| 135 | |
| 136 | return suite; |
| 137 | } |
| 138 | |
| 139 | CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id"); |
| 140 | |
| 141 | EOT |
| 142 | } |
| 143 | |
| 144 | # finish off the output (C++ interface) |
| 145 | # |
| 146 | sub end_output { |
| 147 | print <<EOT; |
| 148 | |
| 149 | /* |
| 150 | * A suite containing all the above suites |
| 151 | */ |
| 152 | |
| 153 | class regextest : public TestSuite |
| 154 | { |
| 155 | public: |
| 156 | regextest() : TestSuite("regex") { } |
| 157 | static Test *suite(); |
| 158 | }; |
| 159 | |
| 160 | Test *regextest::suite() |
| 161 | { |
| 162 | TestSuite *suite = new regextest; |
| 163 | |
| 164 | EOT |
| 165 | print " suite->addTest(".$_->[1]."::suite());\n" for @classes; |
| 166 | |
| 167 | print <<EOT; |
| 168 | |
| 169 | return suite; |
| 170 | } |
| 171 | |
| 172 | CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex"); |
| 173 | CPPUNIT_TEST_SUITE_REGISTRATION(regextest); |
| 174 | EOT |
| 175 | } |
| 176 | |
| 177 | # Parse a tcl string. Handles curly quoting and double quoting. |
| 178 | # |
| 179 | sub parsetcl { |
| 180 | my ($curly, $quote); |
| 181 | # recursively defined expression that can parse balanced braces |
| 182 | # warning: uses experimental features of perl, see perlop(1) |
| 183 | $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/; |
| 184 | $quote = qr/"(?:\\"|[^"])*"/; |
| 185 | my @tokens = shift =~ /($curly|$quote|\S+)/g; |
| 186 | |
| 187 | # now remove braces/quotes and unescape any escapes |
| 188 | for (@tokens) { |
| 189 | if (s/^{(.*)}$/$1/) { |
| 190 | # for curly quoting, only unescape \{ and \} |
| 191 | s/\\([{}])/$1/g; |
| 192 | } else { |
| 193 | s/^"(.*)"$/$1/; |
| 194 | |
| 195 | # unescape any escapes |
| 196 | my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f", |
| 197 | "n" => "\n", "r" => "\r", "t" => "\t", |
| 198 | "v" => "\013" ); |
| 199 | my $x = qr/[[:xdigit:]]/; |
| 200 | |
| 201 | s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/ |
| 202 | if ($1 =~ m{^([0-7]+)}) { |
| 203 | chr(oct($1)); |
| 204 | } elsif ($1 =~ m{^x($x+)}) { |
| 205 | pack("C0U", hex($1) & 0xff); |
| 206 | } elsif ($1 =~ m{^u($x+)}) { |
| 207 | pack("C0U", hex($1)); |
| 208 | } elsif ($esc{$1}) { |
| 209 | $esc{$1}; |
| 210 | } else { |
| 211 | $1; |
| 212 | } |
| 213 | /ge; |
| 214 | } |
| 215 | } |
| 216 | |
| 217 | return @tokens; |
| 218 | } |
| 219 | |
| 220 | # helpers which keep track of whether begin_section has been called, so that |
| 221 | # end_section can be called when appropriate |
| 222 | # |
| 223 | my @doing = ("0", ""); |
| 224 | my $in_section = 0; |
| 225 | |
| 226 | sub handle_doing { |
| 227 | end_section if $in_section; |
| 228 | $in_section = 0; |
| 229 | @doing = @_; |
| 230 | } |
| 231 | |
| 232 | sub handle_test { |
| 233 | begin_section(@doing) if !$in_section; |
| 234 | $in_section = 1; |
| 235 | write_test @_; |
| 236 | } |
| 237 | |
| 238 | sub handle_end { |
| 239 | end_section if $in_section; |
| 240 | $in_section = 0; |
| 241 | end_output; |
| 242 | } |
| 243 | |
| 244 | # 'main' - start by parsing the command lines options. |
| 245 | # |
| 246 | my $badoption = !@ARGV; |
| 247 | my $utfdefault = $utf; |
| 248 | my $outputname; |
| 249 | |
| 250 | for (my $i = 0; $i < @ARGV; ) { |
| 251 | if ($ARGV[$i] !~ m{^-.}) { |
| 252 | $i++; |
| 253 | next; |
| 254 | } |
| 255 | |
| 256 | if ($ARGV[$i] eq '--') { |
| 257 | splice @ARGV, $i, 1; |
| 258 | last; |
| 259 | } |
| 260 | |
| 261 | if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) { # -o : output file |
| 262 | $outputname = $2 || splice @ARGV, $i + 1, 1; |
| 263 | } |
| 264 | |
| 265 | for (split //, substr($ARGV[$i], 1)) { |
| 266 | if (/u/i) { # -u : utf-8 output |
| 267 | $utf = 1; |
| 268 | } elsif (/w/i) { # -w : wide char output |
| 269 | $utf = 0; |
| 270 | } else { |
| 271 | $badoption = 1; |
| 272 | } |
| 273 | } |
| 274 | |
| 275 | splice @ARGV, $i, 1; |
| 276 | } |
| 277 | |
| 278 | # Display help |
| 279 | # |
| 280 | if ($badoption) { |
| 281 | my $prog = basename $0; |
| 282 | my ($w, $u) = (" (default)", " "); |
| 283 | ($w, $u) = ($u, $w) if $utfdefault; |
| 284 | |
| 285 | print <<EOT; |
| 286 | Usage: $prog [-u|-w] [-o OUTPUT] [FILE...] |
| 287 | Generate test code for wxRegEx from 'reg.test' |
| 288 | Example: $prog -o regex.inc reg.test wxreg.test |
| 289 | |
| 290 | -w$w Output will be wide characters. |
| 291 | -u$u Output will be UTF-8 encoded. |
| 292 | |
| 293 | Input files should be in UTF-8. If no input files are specified input is |
| 294 | read from stdin. If no output file is specified output is written to stdout. |
| 295 | See the comments in reg.test for details of the input file format. |
| 296 | EOT |
| 297 | exit 0; |
| 298 | } |
| 299 | |
| 300 | # Open the output file |
| 301 | # |
| 302 | open STDOUT, ">$outputname" if $outputname; |
| 303 | |
| 304 | # Read in the files and initially parse just the comments for copyright |
| 305 | # information and instructions on the tests |
| 306 | # |
| 307 | my @input; # slurped input files stripped of comments |
| 308 | my $files = ""; # copyright info from the input comments |
| 309 | my $instructions = ""; # test instructions from the input comments |
| 310 | |
| 311 | do { |
| 312 | my $inputname = basename $ARGV[0] if @ARGV; |
| 313 | |
| 314 | # slurp input |
| 315 | undef $/; |
| 316 | my $in = <>; |
| 317 | |
| 318 | # remove escaped newlines |
| 319 | $in =~ s/(?<!\\)\\\n//g; |
| 320 | |
| 321 | # record the copyrights of the input files |
| 322 | for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) { |
| 323 | s/[\s:]+/ /g; |
| 324 | $files .= " "; |
| 325 | $files .= $inputname . ": " if $inputname && $inputname ne '-'; |
| 326 | $files .= "$_\n"; |
| 327 | } |
| 328 | |
| 329 | # Parse the comments for instructions on the tests, which look like this: |
| 330 | # i successful match with -indices (used in checking things like |
| 331 | # nonparticipating subexpressions) |
| 332 | if (!$instructions) { |
| 333 | my $sp = qr{\t| +}; # tab or three or more spaces |
| 334 | my @instructions = $in =~ |
| 335 | /\n( |
| 336 | (?: |
| 337 | \#$sp\S?$sp\S[^\n]+\n # instruction line |
| 338 | (?:\#$sp$sp\S[^\n]+\n)* # continuation lines (if any) |
| 339 | )+ |
| 340 | )/gx; |
| 341 | |
| 342 | if (@instructions) { |
| 343 | $instructions[0] = "Test types:\n$instructions[0]"; |
| 344 | if (@instructions > 1) { |
| 345 | $instructions[1] = "Flag characters:\n$instructions[1]"; |
| 346 | } |
| 347 | $instructions = join "\n", @instructions; |
| 348 | $instructions =~ s/^#([^\t]?)/ $1/mg; |
| 349 | } |
| 350 | } |
| 351 | |
| 352 | # @input is the input of all files (stipped of comments) |
| 353 | $in =~ s/^#.*$//mg; |
| 354 | push @input, $in; |
| 355 | |
| 356 | } while $ARGV[0]; |
| 357 | |
| 358 | # Make a string naming the generator, the input files and copyright info |
| 359 | # |
| 360 | my $from = "Generated " . localtime() . " by " . basename $0; |
| 361 | $from =~ s/[\s]+/ /g; |
| 362 | if ($files) { |
| 363 | if ($files =~ /:/) { |
| 364 | $from .= " from the following files:"; |
| 365 | } else { |
| 366 | $from .= " from work with the following copyright:"; |
| 367 | } |
| 368 | } |
| 369 | $from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g); # word-wrap |
| 370 | $from .= "\n$files" if $files; |
| 371 | |
| 372 | # Now start to print the code |
| 373 | # |
| 374 | begin_output $from, $instructions; |
| 375 | |
| 376 | # numbers for 'extra' sections |
| 377 | my $extra = 1; |
| 378 | |
| 379 | for (@input) |
| 380 | { |
| 381 | # Print the main tests |
| 382 | # |
| 383 | # Test lines look like this: |
| 384 | # m 3 b {\(a\)b} ab ab a |
| 385 | # |
| 386 | # Also looks for heading lines, e.g.: |
| 387 | # doing 4 "parentheses" |
| 388 | # |
| 389 | for (split "\n") { |
| 390 | if (/^doing\s+(\S+)\s+(\S.*)/) { |
| 391 | handle_doing parsetcl "$1 $2"; |
| 392 | } elsif (/^[efimp]\s/) { |
| 393 | handle_test parsetcl $_; |
| 394 | } |
| 395 | } |
| 396 | |
| 397 | # Extra tests |
| 398 | # |
| 399 | # The expression below matches something like this: |
| 400 | # test reg-33.8 {Bug 505048} { |
| 401 | # regexp -inline {\A\s*[^b]*b} ab |
| 402 | # } ab |
| 403 | # |
| 404 | # The three subexpressions then return these parts: |
| 405 | # $extras[$i] = '{Bug 505048}', |
| 406 | # $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab' |
| 407 | # $extras[$i + 2] = 'ab' |
| 408 | # |
| 409 | my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n # line 1 |
| 410 | \s*regexp\s+([^\n]+)\n # line 2 |
| 411 | \}\s*(\S[^\n]*)/gx; # line 3 |
| 412 | |
| 413 | handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras; |
| 414 | |
| 415 | for (my $i = 0; $i < @extras; $i += 3) { |
| 416 | my $id = $extras[$i]; |
| 417 | |
| 418 | # further parse the middle line into options and the rest (i.e. $args) |
| 419 | my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/; |
| 420 | |
| 421 | my @args = parsetcl $args; |
| 422 | $#args = 1; # only want the first two |
| 423 | |
| 424 | # now handle the options |
| 425 | my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f'; |
| 426 | my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : ''; |
| 427 | |
| 428 | # get them all in the right order and print |
| 429 | unshift @args, $test, parsetcl($id), $results ? '-' : 'o'; |
| 430 | push @args, parsetcl(parsetcl($results)) if $results; |
| 431 | handle_test @args; |
| 432 | } |
| 433 | } |
| 434 | |
| 435 | # finish |
| 436 | # |
| 437 | handle_end; |