git.saurik.com Git - wxWidgets.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!/usr/bin/env perl
	2	#############################################################################
	3	# Name: regex.pl
	4	# Purpose: Generate test code for wxRegEx from 'reg.test'
	5	# Author: Mike Wetherell
	6	# Copyright: (c) Mike Wetherell
	7	# Licence: wxWindows licence
	8	#############################################################################
	9
	10	#
	11	# Notes:
	12	# See './regex.pl -h' for usage
	13	#
	14	# Output at the moment is C++ using the cppunit testing framework. The
	15	# language/framework specifics are separated, with the following 5
	16	# subs as an interface: 'begin_output', 'begin_section', 'write_test',
	17	# 'end_section' and 'end_output'. So for a different language/framework,
	18	# implement 5 new similar subs.
	19	#
	20	# I've avoided using 'use encoding "UTF-8"', since this wasn't available
	21	# in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
	22	# earlier than perl 5.6.0 aren't going to work.
	23	#
	24
	25	use strict;
	26	use warnings;
	27	use File::Basename;
	28	#use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
	29
	# if 0 output is wide characters, if 1 output is utf8 encoded
	my $utf = 1;

	# Quote the string in $_ as a C++ string literal (C++ helper).
	#
	# Takes no arguments: the text is read from $_, which is overwritten with
	# the escaped text. Control characters, '"', '\' and every character from
	# \177 upwards are replaced by C++ escape sequences.
	#
	# Returns the escaped text wrapped in double quotes, prefixed with 'L'
	# (a wide literal) when $utf is false.
	#
	sub quotecxx {
	    my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
	                "\n" => "n", "\r" => "r", "\t" => "t",
	                "\013" => "v", '"' => '"', "\\" => "\\" );

	    # working around lack of 'use encoding'. The '*' repeat counts are
	    # essential: a bare "C" converts the first byte only and throws the
	    # rest of the string away.
	    if (!$utf) {
	        $_ = pack "U0C*", unpack "C*", $_;
	        use utf8;
	    }

	    # a capture group is used instead of $&, which perlvar documents as
	    # carrying a program-wide matching penalty on older perls
	    s/([\000-\037"\\\177-\x{ffff}])/
	        if (exists $esc{$1}) {
	            "\\$esc{$1}";
	        } elsif (ord($1) > 0x9f && !$utf) {
	            sprintf "\\u%04x", ord($1);
	        } else {
	            sprintf "\\%03o", ord($1);
	        }
	    /ge;

	    # working around lack of 'use encoding'
	    if (!$utf) {
	        no utf8;
	        $_ = pack "C*", unpack "C*", $_;
	    }

	    return ($utf ? '"' : 'L"') . $_ . '"';
	}
	64
	# Start writing the output code (C++ interface).
	#
	# Prints the comment block that opens the generated file.
	#   $from         - text naming the generator and its inputs
	#   $instructions - notes on the test types and flags; left alone when
	#                   it is empty
	# Every line of both texts becomes one line of the comment.
	#
	sub begin_output {
	    my ($from, $instructions) = @_;

	    # put a blank line in front of a text, then start each of its lines
	    # with the comment leader (swallowing one leading space if present)
	    my $embed = sub {
	        my $text = "\n" . $_[0];
	        $text =~ s/^(?: )?/ * /mg;
	        return $text;
	    };

	    # embed it in the comment
	    $from = $embed->($from);

	    # $instructions contains information about the flags etc.
	    $instructions = $embed->($instructions) if $instructions;

	    my $encoding_note = "";
	    $encoding_note = " (UTF-8 encoded)" if $utf;

	    print "/*\n"
	        . "* Test data for wxRegEx$encoding_note\n"
	        . "$from$instructions */\n"
	        . "\n";
	}
	89
	# Every section begun so far, oldest first; each entry is an
	# [$id, $class] pair.
	my @classes;

	# Start a new section (C++ interface).
	#
	#   $id    - section identifier, used in the suite name "regex.$id"
	#   $title - text shown after the id in the section's heading comment
	#
	# Records the section in @classes and prints the heading comment, the
	# declaration of the section's test-suite class and the opening lines of
	# its suite() function.
	#
	sub begin_section {
	    my ($id, $title) = @_;

	    # the class name is the id with every non-word character turned
	    # into an underscore
	    (my $class = "regextest_$id") =~ s/\W/_/g;
	    push @classes, [$id, $class];

	    # the final empty element supplies the newline after the last line
	    print join "\n",
	        "",
	        "/*",
	        "* $id $title",
	        "*/",
	        "",
	        "class $class : public RegExTestSuite",
	        "{",
	        "public:",
	        "$class() : RegExTestSuite(\"regex.$id\") { }",
	        "static Test *suite();",
	        "};",
	        "",
	        "Test *${class}::suite()",
	        "{",
	        "RegExTestSuite *suite = new $class;",
	        "",
	        "";
	}
	119
	# Output a test line (C++ interface).
	#
	# Each argument is quoted with quotecxx and the results are printed as
	# the arguments of one 'suite->add(...)' call, followed by a closing NULL.
	#
	sub write_test {
	    # quotecxx rewrites $_ in place, so work on a private copy and leave
	    # the caller's values untouched
	    my @scratch = @_;
	    my $arglist = join ', ', map { quotecxx() } @scratch;

	    print " suite->add(" . $arglist . ", NULL);\n";
	}
	127
	# End a section (C++ interface).
	#
	# Closes the suite() function of the section most recently recorded in
	# @classes and prints the macro that registers its class as "regex.$id".
	#
	sub end_section {
	    my ($id, $class) = @{ $classes[-1] };

	    print "\n"
	        . "return suite;\n"
	        . "}\n"
	        . "\n"
	        . "CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, \"regex.$id\");\n"
	        . "\n";
	}
	142
	# Finish off the output (C++ interface).
	#
	# Prints the 'regextest' class, whose suite() adds the suite of every
	# section recorded in @classes, followed by its registration macros.
	#
	sub end_output {
	    my @opening = (
	        '',
	        '/*',
	        '* A suite containing all the above suites',
	        '*/',
	        '',
	        'class regextest : public TestSuite',
	        '{',
	        'public:',
	        'regextest() : TestSuite("regex") { }',
	        'static Test *suite();',
	        '};',
	        '',
	        'Test *regextest::suite()',
	        '{',
	        'TestSuite *suite = new regextest;',
	        '',
	    );
	    print join '', map { "$_\n" } @opening;

	    # one addTest line per section, in the order the sections were begun
	    foreach my $entry (@classes) {
	        my $class = $entry->[1];
	        print " suite->addTest(" . $class . "::suite());\n";
	    }

	    my @closing = (
	        '',
	        'return suite;',
	        '}',
	        '',
	        'CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");',
	        'CPPUNIT_TEST_SUITE_REGISTRATION(regextest);',
	    );
	    print join '', map { "$_\n" } @closing;
	}
	175
	# Parse a tcl string. Handles curly quoting and double quoting.
	#
	# Takes one string and splits it into words. A word is a balanced {...}
	# group, a "..." string or a run of non-whitespace characters.
	#
	# Returns the list of words with the outer braces or quotes removed:
	#   - in a {...} word only \{ and \} are unescaped;
	#   - in any other word \NNN (octal), \xHH.. (low 8 bits kept), \uHHHH
	#     and the escapes \a \b \f \n \r \t \v are replaced, and any other
	#     escaped character stands for itself.
	#
	sub parsetcl {
	    my ($text) = @_;
	    $text = "" unless defined $text;

	    my ($curly, $quote);
	    # recursively defined expression that can parse balanced braces
	    # warning: uses experimental features of perl, see perlop(1)
	    $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
	    $quote = qr/"(?:\\"|[^"])*"/;
	    my @tokens = $text =~ /($curly|$quote|\S+)/g;

	    # now remove braces/quotes and unescape any escapes
	    for (@tokens) {
	        # literal braces are escaped: an unescaped '{' in a pattern is
	        # rejected by newer perls. /s lets a quoted word span lines.
	        if (s/^\{(.*)\}$/$1/s) {
	            # for curly quoting, only unescape \{ and \}
	            s/\\([{}])/$1/g;
	        } else {
	            s/^"(.*)"$/$1/s;

	            # unescape any escapes
	            my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
	                        "n" => "\n", "r" => "\r", "t" => "\t",
	                        "v" => "\013" );
	            my $x = qr/[[:xdigit:]]/;

	            # (?:$x){1,4} rather than $x{1,4}, which reads like a hash
	            # subscript. Each successful inner match below replaces $1
	            # with the digits alone; a failed one leaves $1 untouched.
	            s/\\([0-7]{1,3}|x$x+|u(?:$x){1,4}|.)/
	                if ($1 =~ m{^([0-7]+)}) {
	                    chr(oct($1));
	                } elsif ($1 =~ m{^x($x+)}) {
	                    pack("C0U", hex($1) & 0xff);
	                } elsif ($1 =~ m{^u($x+)}) {
	                    pack("C0U", hex($1));
	                } elsif ($esc{$1}) {
	                    $esc{$1};
	                } else {
	                    $1;
	                }
	            /ge;
	        }
	    }

	    return @tokens;
	}
	218
	# State shared by the handle_* helpers, which keep track of whether
	# begin_section has been called, so that end_section can be called when
	# appropriate.
	#
	my @doing = ("0", "");      # arguments for the next begin_section call
	my $in_section = 0;         # true while a begun section is still open

	# Take note of a new heading: close the open section, if there is one,
	# and keep the heading's fields for the next call of begin_section.
	sub handle_doing {
	    if ($in_section) {
	        end_section();
	    }
	    $in_section = 0;
	    @doing = @_;
	}
	230
	# Write one test, first beginning a section with the remembered heading
	# if none is open. The arguments are passed on to write_test unchanged.
	sub handle_test {
	    unless ($in_section) {
	        begin_section(@doing);
	    }
	    $in_section = 1;
	    return write_test(@_);
	}
	236
	# Finish the output: close the open section, if there is one, then write
	# the code that ends the generated file.
	sub handle_end {
	    if ($in_section) {
	        end_section();
	    }
	    $in_section = 0;
	    return end_output();
	}
	242
	# 'main' - start by parsing the command line options.
	#
	# Options are removed from @ARGV as they are recognised, so that only the
	# input file names remain. $badoption ends up true when the help text
	# should be shown: no arguments at all, an unknown flag, or '-o' without
	# a file name.
	#
	my $badoption = !@ARGV;
	my $utfdefault = $utf;
	my $outputname;

	for (my $i = 0; $i < @ARGV; ) {
	    # not an option (a file name, or a lone '-'): leave it in place
	    if ($ARGV[$i] !~ m{^-.}) {
	        $i++;
	        next;
	    }

	    # '--' ends the options; everything after it is a file name
	    if ($ARGV[$i] eq '--') {
	        splice @ARGV, $i, 1;
	        last;
	    }

	    # -o : output file. The name is either attached ('-oFILE') or is the
	    # next argument ('-o FILE'). Flags bundled in front of the 'o' are
	    # kept for the loop below; the non-greedy group stops at the first
	    # 'o' so that an attached name may itself contain that letter.
	    if ($ARGV[$i] =~ s{^-(.*?)o(.*)$}{-$1}i) {
	        my $attached = $2;
	        if (length $attached) {
	            $outputname = $attached;
	        } elsif ($i + 1 < @ARGV) {
	            $outputname = splice @ARGV, $i + 1, 1;
	        } else {
	            $badoption = 1;     # '-o' with nothing after it
	        }
	    }

	    for my $flag (split //, substr($ARGV[$i], 1)) {
	        if ($flag =~ /u/i) {            # -u : utf-8 output
	            $utf = 1;
	        } elsif ($flag =~ /w/i) {       # -w : wide char output
	            $utf = 0;
	        } else {
	            $badoption = 1;
	        }
	    }

	    splice @ARGV, $i, 1;
	}
	276
	# Display help
	#
	# Shown, followed by a successful exit, whenever $badoption is set.
	#
	if ($badoption) {
	    my $prog = basename($0);

	    # mark whichever of -w and -u was the default before option parsing
	    my ($w, $u) = $utfdefault ? (" ", " (default)")
	                              : (" (default)", " ");

	    # the final empty element supplies the newline after the last line
	    print join "\n",
	        "Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]",
	        "Generate test code for wxRegEx from 'reg.test'",
	        "Example: $prog -o regex.inc reg.test wxreg.test",
	        "",
	        "-w$w Output will be wide characters.",
	        "-u$u Output will be UTF-8 encoded.",
	        "",
	        "Input files should be in UTF-8. If no input files are specified input is",
	        "read from stdin. If no output file is specified output is written to stdout.",
	        "See the comments in reg.test for details of the input file format.",
	        "";
	    exit 0;
	}
	298
	# Open the output file
	#
	# The three-argument form keeps characters in the name from being read
	# as an open mode. Failure stops the script instead of letting the
	# generated code go to the original stdout unnoticed. The name is tested
	# for length rather than truth so that a file called '0' is accepted.
	#
	if (defined $outputname && length $outputname) {
	    open STDOUT, '>', $outputname
	        or die "$0: cannot open '$outputname' for writing: $!\n";
	}
	302
	# Read in the files and initially parse just the comments for copyright
	# information and instructions on the tests
	#
	my @input;              # slurped input files stripped of comments
	my $files = "";         # copyright info from the input comments
	my $instructions = "";  # test instructions from the input comments

	do {
	    # Name of the file about to be read; stays undefined when there are
	    # no file arguments. Declared unconditionally, because perlsyn leaves
	    # the behaviour of 'my $x = ... if ...' undefined.
	    my $inputname;
	    $inputname = basename($ARGV[0]) if @ARGV;

	    # slurp input; 'local' confines the change of $/ to this block
	    local $/;
	    my $in = <>;
	    $in = "" unless defined $in;

	    # remove escaped newlines
	    $in =~ s/(?<!\\)\\\n//g;

	    # record the copyrights of the input files: any comment line that
	    # mentions 'copyright', however it is indented or prefixed
	    for my $notice ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
	        (my $line = $notice) =~ s/[\s:]+/ /g;
	        $files .= " ";
	        $files .= $inputname . ": " if $inputname && $inputname ne '-';
	        $files .= "$line\n";
	    }

	    # Parse the comments for instructions on the tests, which look like this:
	    # i successful match with -indices (used in checking things like
	    # nonparticipating subexpressions)
	    # Only the first input that contains any is used.
	    if (!$instructions) {
	        # Tab or three or more spaces. Compiled without /x, so the space
	        # in it stays significant inside the /x pattern below.
	        my $sp = qr/\t| {3,}/;
	        my @instructions = $in =~
	            /\n(
	                (?:
	                    \#$sp\S?$sp\S[^\n]+\n       # instruction line
	                    (?:\#$sp$sp\S[^\n]+\n)*     # continuation lines (if any)
	                )+
	            )/gx;

	        if (@instructions) {
	            $instructions[0] = "Test types:\n$instructions[0]";
	            if (@instructions > 1) {
	                $instructions[1] = "Flag characters:\n$instructions[1]";
	            }
	            $instructions = join "\n", @instructions;
	            $instructions =~ s/^#([^\t]?)/ $1/mg;
	        }
	    }

	    # @input is the input of all files (stripped of comments)
	    $in =~ s/^#.*$//mg;
	    push @input, $in;

	    # tested by count, so that a file called '0' does not end the loop
	} while @ARGV;
	356
	# Make a string naming the generator, the input files and copyright info
	#
	my $from = "Generated " . localtime() . " by " . basename($0);
	$from =~ s/[\s]+/ /g;
	if ($files) {
	    # colons were removed from the copyright lines themselves, so one
	    # can only come from an input file name having been prepended
	    if ($files =~ /:/) {
	        $from .= " from the following files:";
	    } else {
	        $from .= " from work with the following copyright:";
	    }
	}
	# word-wrap: split into pieces of at most 76 characters, each ending at
	# whitespace or at the end of the text
	$from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g);
	$from .= "\n$files" if $files;

	# Now start to print the code
	#
	begin_output($from, $instructions);
	374
	# numbers for 'extra' sections
	my $extra = 1;

	for my $text (@input)
	{
	    # Print the main tests
	    #
	    # Test lines look like this:
	    # m 3 b {$a$b} ab ab a
	    #
	    # Also looks for heading lines, e.g.:
	    # doing 4 "parentheses"
	    #
	    for my $line (split "\n", $text) {
	        if ($line =~ /^doing\s+(\S+)\s+(\S.*)/) {
	            handle_doing(parsetcl("$1 $2"));
	        } elsif ($line =~ /^[efimp]\s/) {
	            handle_test(parsetcl($line));
	        }
	    }

	    # Extra tests
	    #
	    # The expression below matches something like this:
	    # test reg-33.8 {Bug 505048} {
	    # regexp -inline {\A\s[^b]b} ab
	    # } ab
	    #
	    # The three subexpressions then return these parts:
	    # $extras[$i] = '{Bug 505048}',
	    # $extras[$i + 1] = '-inline {\A\s[^b]b} ab'
	    # $extras[$i + 2] = 'ab'
	    #
	    my @extras = $text =~ /\ntest\s+\S+\s*(\{.*?\})\s*\{\n  # line 1
	                           \s*regexp\s+([^\n]+)\n           # line 2
	                           \}\s*(\S[^\n]*)/gx;              # line 3

	    handle_doing("extra_" . $extra++, "checks for bug fixes") if @extras;

	    for (my $i = 0; $i < @extras; $i += 3) {
	        my ($id, $command, $expected) = @extras[$i .. $i + 2];

	        # further parse the middle line into options and the rest (i.e. $args)
	        my ($opts, $args) = $command =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
	        if (!defined $args) {
	            # nothing usable after the options: report it rather than
	            # emit a test built from undefined values
	            warn "skipping extra test $id: cannot parse 'regexp $command'\n";
	            next;
	        }

	        my @args = parsetcl($args);
	        $#args = 1;     # only want the first two

	        # now handle the options
	        my $test;
	        if ($opts =~ /-indices/) {
	            $test = 'i';
	        } elsif ($expected) {
	            $test = 'm';
	        } else {
	            # an expected result of '0' is false in perl and lands here
	            $test = 'f';
	        }
	        my $results = ($opts =~ /-inline/ && $test ne 'f') ? $expected : '';

	        # get them all in the right order and print
	        unshift @args, $test, parsetcl($id), $results ? '-' : 'o';
	        push @args, parsetcl(parsetcl($results)) if $results;
	        handle_test(@args);
	    }
	}

	# finish
	#
	handle_end();