git.saurik.com Git - wxWidgets.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!/usr/bin/env perl
	2	#############################################################################
	3	# Name: regex.pl
	4	# Purpose: Generate test code for wxRegEx from 'reg.test'
	5	# Author: Mike Wetherell
	6	# RCS-ID: $Id$
	7	# Copyright: (c) Mike Wetherell
	8	# Licence: wxWidgets licence
	9	#############################################################################
	10
	11	#
	12	# Notes:
	13	# See './regex.pl -h' for usage
	14	#
	15	# Output at the moment is C++ using the cppunit testing framework. The
	16	# language/framework specifics are separated, with the following 5
	17	# subs as an interface: 'begin_output', 'begin_section', 'write_test',
	18	# 'end_section' and 'end_output'. So for a different language/framework,
	19	# implement 5 new similar subs.
	20	#
	21	# I've avoided using 'use encoding "UTF-8"', since this wasn't available
	22	# in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
	23	# earler than perl 5.6.0 aren't going to work.
	24	#
	25
	26	use strict;
	27	use warnings;
	28	use File::Basename;
	29	#use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
	30
	31	# if 0 output is wide characters, if 1 output is utf8 encoded
	32	my $utf = 1;
	33
	34	# quote a parameter (C++ helper)
	35	#
	36	sub quotecxx {
	37	my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
	38	"\n" => "n", "\r" => "r", "\t" => "t",
	39	"\013" => "v", '"' => '"', "\\" => "\\" );
	40
	41	# working around lack of 'use encoding'
	42	$_ = pack "U0C", unpack "C", $_;
	43	use utf8;
	44
	45	s/[\000-\037"\\\177-\x{ffff}]/
	46	if ($esc{$&}) {
	47	"\\$esc{$&}";
	48	} elsif (ord($&) > 0x9f) {
	49	if ($utf) {
	50	$&;
	51	} else {
	52	sprintf "\\u%04x", ord($&);
	53	}
	54	} else {
	55	sprintf "\\%03o", ord($&);
	56	}
	57	/ge;
	58
	59	# working around lack of 'use encoding'
	60	no utf8;
	61	$_ = pack "C", unpack "C", $_;
	62
	63	return ($utf ? '"' : 'L"') . $_ . '"'
	64	}
	65
	66	# start writing the output code (C++ interface)
	67	#
	68	sub begin_output {
	69	my ($from, $instructions) = @_;
	70
	71	# embed it in the comment
	72	$from = "\n$from";
	73	$from =~ s/^(?: )?/ * /mg;
	74
	75	# $instructions contains information about the flags etc.
	76	if ($instructions) {
	77	$instructions = "\n$instructions";
	78	$instructions =~ s/^(?: )?/ * /mg;
	79	}
	80
	81	my $u = $utf ? " (UTF-8 encoded)" : "";
	82
	83	print <<EOT;
	84	/*
	85	* Test data for wxRegEx$u
	86	$from$instructions */
	87
	88	EOT
	89	}
	90
	91	my @classes;
	92
	93	# start a new section (C++ interface)
	94	#
	95	sub begin_section {
	96	my ($id, $title) = @_;
	97	my $class = "regextest_$id";
	98	$class =~ s/\W/_/g;
	99	push @classes, [$id, $class];
	100
	101	print <<EOT;
	102
	103	/*
	104	* $id $title
	105	*/
	106
	107	class $class : public RegExTestSuite
	108	{
	109	public:
	110	$class() : RegExTestSuite("regex.$id") { }
	111	static Test *suite();
	112	};
	113
	114	Test *$class\::suite()
	115	{
	116	RegExTestSuite *suite = new $class;
	117
	118	EOT
	119	}
	120
	121	# output a test line (C++ interface)
	122	#
	123	sub write_test {
	124	my @args = @_;
	125	$_ = quotecxx for @args;
	126	print " suite->add(" . (join ', ', @args) . ", NULL);\n";
	127	}
	128
	129	# end a section (C++ interface)
	130	#
	131	sub end_section {
	132	my ($id, $class) = @{$classes[$#classes]};
	133
	134	print <<EOT;
	135
	136	return suite;
	137	}
	138
	139	CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
	140
	141	EOT
	142	}
	143
	144	# finish off the output (C++ interface)
	145	#
	146	sub end_output {
	147	print <<EOT;
	148
	149	/*
	150	* A suite containing all the above suites
	151	*/
	152
	153	class regextest : public TestSuite
	154	{
	155	public:
	156	regextest() : TestSuite("regex") { }
	157	static Test *suite();
	158	};
	159
	160	Test *regextest::suite()
	161	{
	162	TestSuite *suite = new regextest;
	163
	164	EOT
	165	print " suite->addTest(".$_->[1]."::suite());\n" for @classes;
	166
	167	print <<EOT;
	168
	169	return suite;
	170	}
	171
	172	CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
	173	CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
	174	EOT
	175	}
	176
	177	# Parse a tcl string. Handles curly quoting and double quoting.
	178	#
	179	sub parsetcl {
	180	my ($curly, $quote);
	181	# recursively defined expression that can parse balanced braces
	182	# warning: uses experimental features of perl, see perlop(1)
	183	$curly = qr/\{(?:(?>(?:\\[{}]\|[^{}])+)\|(??{$curly}))*\}/;
	184	$quote = qr/"(?:\\"\|[^"])*"/;
	185	my @tokens = shift =~ /($curly\|$quote\|\S+)/g;
	186
	187	# now remove braces/quotes and unescape any escapes
	188	for (@tokens) {
	189	if (s/^{(.*)}$/$1/) {
	190	# for curly quoting, only unescape \{ and \}
	191	s/\\([{}])/$1/g;
	192	} else {
	193	s/^"(.*)"$/$1/;
	194
	195	# unescape any escapes
	196	my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
	197	"n" => "\n", "r" => "\r", "t" => "\t",
	198	"v" => "\013" );
	199	my $x = qr/[[:xdigit:]]/;
	200
	201	s/\\([0-7]{1,3}\|x$x+\|u$x{1,4}\|.)/
	202	if ($1 =~ m{^([0-7]+)}) {
	203	chr(oct($1));
	204	} elsif ($1 =~ m{^x($x+)}) {
	205	pack("C0U", hex($1) & 0xff);
	206	} elsif ($1 =~ m{^u($x+)}) {
	207	pack("C0U", hex($1));
	208	} elsif ($esc{$1}) {
	209	$esc{$1};
	210	} else {
	211	$1;
	212	}
	213	/ge;
	214	}
	215	}
	216
	217	return @tokens;
	218	}
	219
	220	# helpers which keep track of whether begin_section has been called, so that
	221	# end_section can be called when appropriate
	222	#
	223	my @doing = ("0", "");
	224	my $in_section = 0;
	225
	226	sub handle_doing {
	227	end_section if $in_section;
	228	$in_section = 0;
	229	@doing = @_;
	230	}
	231
	232	sub handle_test {
	233	begin_section(@doing) if !$in_section;
	234	$in_section = 1;
	235	write_test @_;
	236	}
	237
	238	sub handle_end {
	239	end_section if $in_section;
	240	$in_section = 0;
	241	end_output;
	242	}
	243
	244	# 'main' - start by parsing the command lines options.
	245	#
	246	my $badoption = !@ARGV;
	247	my $utfdefault = $utf;
	248	my $outputname;
	249
	250	for (my $i = 0; $i < @ARGV; ) {
	251	if ($ARGV[$i] !~ m{^-.}) {
	252	$i++;
	253	next;
	254	}
	255
	256	if ($ARGV[$i] eq '--') {
	257	splice @ARGV, $i, 1;
	258	last;
	259	}
	260
	261	if ($ARGV[$i] =~ s{^-(.)o(.)$}{-$1}i) { # -o : output file
	262	$outputname = $2 \|\| splice @ARGV, $i + 1, 1;
	263	}
	264
	265	for (split //, substr($ARGV[$i], 1)) {
	266	if (/u/i) { # -u : utf-8 output
	267	$utf = 1;
	268	} elsif (/w/i) { # -w : wide char output
	269	$utf = 0;
	270	} else {
	271	$badoption = 1;
	272	}
	273	}
	274
	275	splice @ARGV, $i, 1;
	276	}
	277
	278	# Display help
	279	#
	280	if ($badoption) {
	281	my $prog = basename $0;
	282	my ($w, $u) = (" (default)", " ");
	283	($w, $u) = ($u, $w) if $utfdefault;
	284
	285	print <<EOT;
	286	Usage: $prog [-u\|-w] [-o OUTPUT] [FILE...]
	287	Generate test code for wxRegEx from 'reg.test'
	288	Example: $prog -o regex.inc reg.test wxreg.test
	289
	290	-w$w Output will be wide characters.
	291	-u$u Output will be UTF-8 encoded.
	292
	293	Input files should be in UTF-8. If no input files are specified input is
	294	read from stdin. If no output file is specified output is written to stdout.
	295	See the comments in reg.test for details of the input file format.
	296	EOT
	297	exit 0;
	298	}
	299
	300	# Open the output file
	301	#
	302	open STDOUT, ">$outputname" if $outputname;
	303
	304	# Read in the files and initially parse just the comments for copyright
	305	# information and instructions on the tests
	306	#
	307	my @input; # slurped input files stripped of comments
	308	my $files = ""; # copyright info from the input comments
	309	my $instructions = ""; # test instructions from the input comments
	310
	311	do {
	312	my $inputname = basename $ARGV[0] if @ARGV;
	313
	314	# slurp input
	315	undef $/;
	316	my $in = <>;
	317
	318	# remove escaped newlines
	319	$in =~ s/(?<!\\)\\\n//g;
	320
	321	# record the copyrights of the input files
	322	for ($in =~ /^#[\t ](.copyright.*)$/mig) {
	323	s/[\s:]+/ /g;
	324	$files .= " ";
	325	$files .= $inputname . ": " if $inputname && $inputname ne '-';
	326	$files .= "$_\n";
	327	}
	328
	329	# Parse the comments for instructions on the tests, which look like this:
	330	# i successful match with -indices (used in checking things like
	331	# nonparticipating subexpressions)
	332	if (!$instructions) {
	333	my $sp = qr{\t\| +}; # tab or three or more spaces
	334	my @instructions = $in =~
	335	/\n(
	336	(?:
	337	\#$sp\S?$sp\S[^\n]+\n # instruction line
	338	(?:\#$sp$sp\S[^\n]+\n)* # continuation lines (if any)
	339	)+
	340	)/gx;
	341
	342	if (@instructions) {
	343	$instructions[0] = "Test types:\n$instructions[0]";
	344	if (@instructions > 1) {
	345	$instructions[1] = "Flag characters:\n$instructions[1]";
	346	}
	347	$instructions = join "\n", @instructions;
	348	$instructions =~ s/^#([^\t]?)/ $1/mg;
	349	}
	350	}
	351
	352	# @input is the input of all files (stipped of comments)
	353	$in =~ s/^#.*$//mg;
	354	push @input, $in;
	355
	356	} while $ARGV[0];
	357
	358	# Make a string naming the generator, the input files and copyright info
	359	#
	360	my $from = "Generated " . localtime() . " by " . basename $0;
	361	$from =~ s/[\s]+/ /g;
	362	if ($files) {
	363	if ($files =~ /:/) {
	364	$from .= " from the following files:";
	365	} else {
	366	$from .= " from work with the following copyright:";
	367	}
	368	}
	369	$from = join("\n", $from =~ /(.{0,76}(?:\s\|$))/g); # word-wrap
	370	$from .= "\n$files" if $files;
	371
	372	# Now start to print the code
	373	#
	374	begin_output $from, $instructions;
	375
	376	# numbers for 'extra' sections
	377	my $extra = 1;
	378
	379	for (@input)
	380	{
	381	# Print the main tests
	382	#
	383	# Test lines look like this:
	384	# m 3 b {$a$b} ab ab a
	385	#
	386	# Also looks for heading lines, e.g.:
	387	# doing 4 "parentheses"
	388	#
	389	for (split "\n") {
	390	if (/^doing\s+(\S+)\s+(\S.*)/) {
	391	handle_doing parsetcl "$1 $2";
	392	} elsif (/^[efimp]\s/) {
	393	handle_test parsetcl $_;
	394	}
	395	}
	396
	397	# Extra tests
	398	#
	399	# The expression below matches something like this:
	400	# test reg-33.8 {Bug 505048} {
	401	# regexp -inline {\A\s[^b]b} ab
	402	# } ab
	403	#
	404	# The three subexpressions then return these parts:
	405	# $extras[$i] = '{Bug 505048}',
	406	# $extras[$i + 1] = '-inline {\A\s[^b]b} ab'
	407	# $extras[$i + 2] = 'ab'
	408	#
	409	my @extras = /\ntest\s+\S+\s(\{.?\})\s*\{\n # line 1
	410	\s*regexp\s+([^\n]+)\n # line 2
	411	\}\s(\S[^\n])/gx; # line 3
	412
	413	handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
	414
	415	for (my $i = 0; $i < @extras; $i += 3) {
	416	my $id = $extras[$i];
	417
	418	# further parse the middle line into options and the rest (i.e. $args)
	419	my ($opts, $args) = $extras[$i + 1] =~ /^\s((?:-\S+\s+))([^\s-].*)/;
	420
	421	my @args = parsetcl $args;
	422	$#args = 1; # only want the first two
	423
	424	# now handle the options
	425	my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
	426	my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
	427
	428	# get them all in the right order and print
	429	unshift @args, $test, parsetcl($id), $results ? '-' : 'o';
	430	push @args, parsetcl(parsetcl($results)) if $results;
	431	handle_test @args;
	432	}
	433	}
	434
	435	# finish
	436	#
	437	handle_end;