--- /dev/null
+#!/usr/bin/env perl -w
+#############################################################################
+# Name: regex.pl
+# Purpose: Generate test code for wxRegEx from 'reg.test'
+# Author: Mike Wetherell
+# RCS-ID: $Id$
+# Copyright: (c) Mike Wetherell
+# Licence: wxWidgets licence
+#############################################################################
+
+#
+# Notes:
+# See './regex.pl -h' for usage
+#
+# Output at the moment is C++ using the cppunit testing framework. The
+# language/framework specifics are separated, with the following 5
+# subs as an interface: 'begin_output', 'begin_section', 'write_test',
+# 'end_section' and 'end_output'. So for a different language/framework,
+# implement 5 new similar subs.
+#
+# I've avoided using 'use encoding "UTF-8"', since this wasn't available
+# in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
+# earler than perl 5.6.0 aren't going to work.
+#
+
+use strict;
+use File::Basename;
+#use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
+
+# if 0 output is wide characters, if 1 output is utf8 encoded
+my $utf = 1;
+
+# quote a parameter (C++ helper)
+#
+sub quotecxx {
+ my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
+ "\n" => "n", "\r" => "r", "\t" => "t",
+ "\013" => "v", '"' => '"', "\\" => "\\" );
+
+ # working around lack of 'use encoding'
+ $_ = pack "U0C*", unpack "C*", $_;
+ use utf8;
+
+ s/[\000-\037"\\\177-\x{ffff}]/
+ if ($esc{$&}) {
+ "\\$esc{$&}";
+ } elsif (ord($&) > 0x9f) {
+ if ($utf) {
+ $&;
+ } else {
+ sprintf "\\u%04x", ord($&);
+ }
+ } else {
+ sprintf "\\%03o", ord($&);
+ }
+ /ge;
+
+ # working around lack of 'use encoding'
+ no utf8;
+ $_ = pack "C*", unpack "C*", $_;
+
+ return ($utf ? '"' : 'L"') . $_ . '"'
+}
+
+# start writing the output code (C++ interface)
+#
+sub begin_output {
+ my ($from, $instructions) = @_;
+
+ # embed it in the comment
+ $from = "\n$from";
+ $from =~ s/^(?: )?/ * /mg;
+
+ # $instructions contains information about the flags etc.
+ if ($instructions) {
+ $instructions = "\n$instructions";
+ $instructions =~ s/^(?: )?/ * /mg;
+ }
+
+ my $u = $utf ? " (UTF-8 encoded)" : "";
+
+ print <<EOT;
+/*
+ * Test data for wxRegEx$u
+$from$instructions */
+
+EOT
+}
+
+my @classes;
+
+# start a new section (C++ interface)
+#
+sub begin_section {
+ my ($id, $title) = @_;
+ my $class = "regextest_$id";
+ $class =~ s/\W/_/g;
+ push @classes, [$id, $class];
+
+ print <<EOT;
+
+/*
+ * $id $title
+ */
+
+class $class : public RegExTestSuite
+{
+public:
+ $class() : RegExTestSuite("regex.$id") { }
+ static Test *suite();
+};
+
+Test *$class\::suite()
+{
+ RegExTestSuite *suite = new $class;
+
+EOT
+}
+
+# output a test line (C++ interface)
+#
+sub write_test {
+ my @args = @_;
+ $_ = quotecxx for @args;
+ print " suite->add(" . (join ', ', @args) . ", NULL);\n";
+}
+
+# end a section (C++ interface)
+#
+sub end_section {
+ my ($id, $class) = @{$classes[$#classes]};
+
+ print <<EOT;
+
+ return suite;
+}
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
+
+EOT
+}
+
+# finish off the output (C++ interface)
+#
+sub end_output {
+ print <<EOT;
+
+/*
+ * A suite containing all the above suites
+ */
+
+class regextest : public TestSuite
+{
+public:
+ regextest() : TestSuite("regex") { }
+ static Test *suite();
+};
+
+Test *regextest::suite()
+{
+ TestSuite *suite = new regextest;
+
+EOT
+ print " suite->addTest(".$_->[1]."::suite());\n" for @classes;
+
+ print <<EOT;
+
+ return suite;
+}
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
+CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
+EOT
+}
+
+# Parse a tcl string. Handles curly quoting and double quoting.
+#
+sub parsetcl {
+ my ($curly, $quote);
+ # recursively defined expression that can parse balanced braces
+ # warning: uses experimental features of perl, see perlop(1)
+ $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
+ $quote = qr/"(?:\\"|[^"])*"/;
+ my @tokens = shift =~ /($curly|$quote|\S+)/g;
+
+ # now remove braces/quotes and unescape any escapes
+ for (@tokens) {
+ if (s/^{(.*)}$/$1/) {
+ # for curly quoting, only unescape \{ and \}
+ s/\\([{}])/$1/g;
+ } else {
+ s/^"(.*)"$/$1/;
+
+ # unescape any escapes
+ my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
+ "n" => "\n", "r" => "\r", "t" => "\t",
+ "v" => "\013" );
+ my $x = qr/[[:xdigit:]]/;
+
+ s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
+ if ($1 =~ m{^([0-7]+)}) {
+ chr(oct($1));
+ } elsif ($1 =~ m{^x($x+)}) {
+ pack("C0U", hex($1) & 0xff);
+ } elsif ($1 =~ m{^u($x+)}) {
+ pack("C0U", hex($1));
+ } elsif ($esc{$1}) {
+ $esc{$1};
+ } else {
+ $1;
+ }
+ /ge;
+ }
+ }
+
+ return @tokens;
+}
+
+# helpers which keep track of whether begin_section has been called, so that
+# end_section can be called when appropriate
+#
+my @doing = ("0", "");
+my $in_section = 0;
+
+sub handle_doing {
+ end_section if $in_section;
+ $in_section = 0;
+ @doing = @_;
+}
+
+sub handle_test {
+ begin_section(@doing) if !$in_section;
+ $in_section = 1;
+ write_test @_;
+}
+
+sub handle_end {
+ end_section if $in_section;
+ $in_section = 0;
+ end_output;
+}
+
+# 'main' - start by parsing the command lines options.
+#
+my $badoption = !@ARGV;
+my $utfdefault = $utf;
+my $outputname;
+
+for (my $i = 0; $i < @ARGV; ) {
+ if ($ARGV[$i] !~ m{^-.}) {
+ $i++;
+ next;
+ }
+
+ if ($ARGV[$i] eq '--') {
+ splice @ARGV, $i, 1;
+ last;
+ }
+
+ if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) { # -o : output file
+ $outputname = $2 || splice @ARGV, $i + 1, 1;
+ }
+
+ for (split //, substr($ARGV[$i], 1)) {
+ if (/u/i) { # -u : utf-8 output
+ $utf = 1;
+ } elsif (/w/i) { # -w : wide char output
+ $utf = 0;
+ } else {
+ $badoption = 1;
+ }
+ }
+
+ splice @ARGV, $i, 1;
+}
+
+# Display help
+#
+if ($badoption) {
+ my $prog = basename $0;
+ my ($w, $u) = (" (default)", " ");
+ ($w, $u) = ($u, $w) if $utfdefault;
+
+ print <<EOT;
+Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
+Generate test code for wxRegEx from 'reg.test'
+Example: $prog -o regex.inc reg.test wxreg.test
+
+ -w$w Output will be wide characters.
+ -u$u Output will be UTF-8 encoded.
+
+Input files should be in UTF-8. If no input files are specified input is
+read from stdin. If no output file is specified output is written to stdout.
+See the comments in reg.test (in src/regex) for details of the input file
+format.
+EOT
+ exit 0;
+}
+
+# Open the output file
+#
+open STDOUT, ">$outputname" if $outputname;
+
+# Read in the files and initially parse just the comments for copyright
+# information and instructions on the tests
+#
+my @input; # slurped input files stripped of comments
+my $files = ""; # copyright info from the input comments
+my $instructions = ""; # test instructions from the input comments
+
+do {
+ my $inputname = basename $ARGV[0] if @ARGV;
+
+ # slurp input
+ undef $/;
+ my $in = <>;
+
+ # remove escaped newlines
+ $in =~ s/(?<!\\)\\\n//g;
+
+ # record the copyrights of the input files
+ for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
+ s/[\s:]+/ /g;
+ $files .= " ";
+ $files .= $inputname . ": " if $inputname && $inputname ne '-';
+ $files .= "$_\n";
+ }
+
+ # Parse the comments for instructions on the tests, which look like this:
+ # i successful match with -indices (used in checking things like
+ # nonparticipating subexpressions)
+ if (!$instructions) {
+ my $sp = qr{\t| +}; # tab or three or more spaces
+ my @instructions = $in =~
+ /\n(
+ (?:
+ \#$sp\S?$sp\S[^\n]+\n # instruction line
+ (?:\#$sp$sp\S[^\n]+\n)* # continuation lines (if any)
+ )+
+ )/gx;
+
+ if (@instructions) {
+ $instructions[0] = "Test types:\n$instructions[0]";
+ if (@instructions > 1) {
+ $instructions[1] = "Flag characters:\n$instructions[1]";
+ }
+ $instructions = join "\n", @instructions;
+ $instructions =~ s/^#([^\t]?)/ $1/mg;
+ }
+ }
+
+ # @input is the input of all files (stipped of comments)
+ $in =~ s/^#.*$//mg;
+ push @input, $in;
+
+} while $ARGV[0];
+
+# Make a string naming the generator, the input files and copyright info
+#
+my $from = "Generated " . localtime() . " by " . basename $0;
+$from =~ s/[\s]+/ /g;
+if ($files) {
+ if ($files =~ /:/) {
+ $from .= " from the following files:";
+ } else {
+ $from .= " from work with the following copyright:";
+ }
+}
+$from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g); # word-wrap
+$from .= "\n$files" if $files;
+
+# Now start to print the code
+#
+begin_output $from, $instructions;
+
+# numbers for 'extra' sections
+my $extra = 1;
+
+for (@input)
+{
+ # Print the main tests
+ #
+ # Test lines look like this:
+ # m 3 b {\(a\)b} ab ab a
+ #
+ # Also looks for heading lines, e.g.:
+ # doing 4 "parentheses"
+ #
+ for (split "\n") {
+ if (/^doing\s+(\S+)\s+(\S.*)/) {
+ handle_doing parsetcl "$1 $2";
+ } elsif (/^[efimp]\s/) {
+ handle_test parsetcl $_;
+ }
+ }
+
+ # Extra tests
+ #
+ # The expression below matches something like this:
+ # test reg-33.8 {Bug 505048} {
+ # regexp -inline {\A\s*[^b]*b} ab
+ # } ab
+ #
+ # The three subexpressions then return these parts:
+ # $extras[$i] = '{Bug 505048}',
+ # $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
+ # $extras[$i + 2] = 'ab'
+ #
+ my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n # line 1
+ \s*regexp\s+([^\n]+)\n # line 2
+ \}\s*(\S[^\n]*)/gx; # line 3
+
+ handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
+
+ for (my $i = 0; $i < @extras; $i += 3) {
+ my $id = $extras[$i];
+
+ # further parse the middle line into options and the rest (i.e. $args)
+ my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
+
+ my @args = parsetcl $args;
+ $#args = 1; # only want the first two
+
+ # now handle the options
+ my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
+ my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
+
+ # get them all in the right order and print
+ unshift @args, $test, parsetcl($id), '-';
+ push @args, parsetcl(parsetcl($results)) if $results;
+ handle_test @args;
+ }
+}
+
+# finish
+#
+handle_end;