git.saurik.com Git - apt.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!/usr/bin/perl -w
	2	#
	3	# Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
	4	#
	5	# Permission is hereby granted, free of charge, to any person obtaining a copy
	6	# of this software and associated documentation files (the "Software"), to deal
	7	# in the Software without restriction, including without limitation the rights
	8	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	9	# copies of the Software, and to permit persons to whom the Software is
	10	# furnished to do so, subject to the following conditions:
	11	#
	12	# The above copyright notice and this permission notice shall be included in
	13	# all copies or substantial portions of the Software.
	14	#
	15	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	16	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	17	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	18	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	19	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	20	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	21	# THE SOFTWARE.
	22
	23
	24	=head1 NAME
	25
	26	triehash - Generate a perfect hash function derived from a trie.
	27
	28	=cut
	29
	30	use strict;
	31	use warnings;
	32	use Getopt::Long;
	33
	34	=head1 SYNOPSIS
	35
	36	B<triehash> [S<I<option>>] [S<I<input file>>]
	37
	38	=head1 DESCRIPTION
	39
	40	triehash takes a list of words in input file and generates a function and
	41	an enumeration to describe the word
	42
	43	=head1 INPUT FILE FORMAT
	44
	45	The file consists of multiple lines of the form:
	46
	47	[label ~ ] word [= value]
	48
	49	This maps word to value, and generates an enumeration with entries of the form:
	50
	51	label = value
	52
	53	If I<label> is undefined, the word will be used, the minus character will be
	54	replaced by an underscore. If value is undefined it is counted upwards from
	55	the last value.
	56
	57	There may also be one line of the format
	58
	59	[ label ~] = value
	60
	61	Which defines the value to be used for non-existing keys. Note that this also
	62	changes default value for other keys, as for normal entries. So if you place
	63
	64	= 0
	65
	66	at the beginning of the file, unknown strings map to 0, and the other strings
	67	map to values starting with 1. If label is not specified, the default is
	68	I<Unknown>.
	69
	70	=head1 OPTIONS
	71
	72	=over 4
	73
	74	=item B<-C>I<.c file> B<--code>=I<.c file>
	75
	76	Generate code in the given file.
	77
	78	=item B<-H>I<header file> B<--header>=I<header file>
	79
	80	Generate a header in the given file, containing a declaration of the hash
	81	function and an enumeration.
	82
	83	=item B<--enum-name=>I<word>
	84
	85	The name of the enumeration.
	86
	87	=item B<--function-name=>I<word>
	88
	89	The name of the function.
	90
	91	=item B<--namespace=>I<name>
	92
	93	Put the function and enum into a namespace (C++)
	94
	95	=item B<--class=>I<name>
	96
	97	Put the function and enum into a class (C++)
	98
	99	=item B<--enum-class>
	100
	101	Generate an enum class instead of an enum (C++)
	102
	103	=item B<--counter-name=>I<name>
	104
	105	Use I<name> for a counter that is set to the latest entry in the enumeration
	106	+ 1. This can be useful for defining array sizes.
	107
	108	=item B<--extern-c>
	109
	110	Wrap everything into an extern "C" block. Not compatible with the C++
	111	options, as a header with namespaces, classes, or enum classes is not
	112	valid C.
	113
	114	=item B<--multi-byte>=I<value>
	115
	116	Generate code reading multiple bytes at once. The value is a string of power
	117	of twos to enable. The default value is 320 meaning that 8, 4, and single byte
	118	reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you
	119	also want to allow 2-byte reads. 2-byte reads are disabled by default because
	120	they negatively affect performance on older Intel architectures.
	121
	122	This generates code for both multiple bytes and single byte reads, but only
	123	enables the multiple byte reads of GNU C compatible compilers, as the following
	124	extensions are used:
	125
	126	=over 8
	127
	128	=item Byte-aligned integers
	129
	130	We must be able to generate integers that are aligned to a single byte using:
	131
	132	typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
	133
	134	=item Byte-order
	135
	136	The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.
	137
	138	=back
	139
	140	We forcefully disable multi-byte reads on platforms where the variable
	141	I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
	142	as there is a measurable overhead from emulating the unaligned reads on
	143	ARM.
	144
	145	=item B<--language=>I<language>
	146
	147	Generate a file in the specified language. Currently known are 'C' and 'tree',
	148	the latter generating a tree.
	149
	150	=item B<--include=>I<header>
	151
	152	Add the header to the include statements of the header file. The value must
	153	be surrounded by quotes or angle brackets for C code. May be specified multiple
	154	times.
	155
	156	=back
	157
	158	=cut
	159
	160	my $unknown = -1;
	161	my $unknown_label = "Unknown";
	162	my $counter_start = 0;
	163	my $enum_name = "PerfectKey";
	164	my $function_name = "PerfectHash";
	165	my $enum_class = 0;
	166
	167	my $code_name = "-";
	168	my $header_name = "-";
	169	my $code;
	170	my $header;
	171	my $ignore_case = 0;
	172	my $multi_byte = "320";
	173	my $language = 'C';
	174	my $counter_name = undef;
	175	my @includes = ();
	176
	177
	178	Getopt::Long::config('default',
	179	'bundling',
	180	'no_getopt_compat',
	181	'no_auto_abbrev',
	182	'permute',
	183	'auto_help');
	184
	185	GetOptions ("code\|C=s" => \$code_name,
	186	"header\|H=s" => \$header_name,
	187	"function-name=s" => \$function_name,
	188	"ignore-case" => \$ignore_case,
	189	"enum-name=s" => \$enum_name,
	190	"language\|l=s" => \$language,
	191	"multi-byte=s" => \$multi_byte,
	192	"enum-class" => \$enum_class,
	193	"include=s" => \@includes,
	194	"counter-name=s" => \$counter_name)
	195	or die("Could not parse options!");
	196
	197
	198	package Trie {
	199
	200	sub new {
	201	my $class = shift;
	202	my $self = {};
	203	bless $self, $class;
	204
	205	$self->{children} = {};
	206	$self->{value} = undef;
	207	$self->{label} = undef;
	208
	209	return $self;
	210	}
	211
	212	# Return the largest power of 2 smaller or equal to the argument
	213	sub alignpower2 {
	214	my ($self, $length) = @_;
	215
	216	return 8 if ($length >= 8 && $multi_byte =~ /3/);
	217	return 4 if ($length >= 4 && $multi_byte =~ /2/);
	218	return 2 if ($length >= 2 && $multi_byte =~ /1/);
	219
	220	return 1;
	221	}
	222
	223	# Split the key into a head block and a tail
	224	sub split_key {
	225	my ($self, $key) = @_;
	226	my $length = length $key;
	227	my $split = $self->alignpower2($length);
	228
	229	return (substr($key, 0, $split), substr($key, $split));
	230	}
	231
	232	sub insert {
	233	my ($self, $key, $label, $value) = @_;
	234
	235	if (length($key) == 0) {
	236	$self->{label} = $label;
	237	$self->{value} = $value;
	238	return;
	239	}
	240
	241	my ($child, $tail) = $self->split_key($key);
	242
	243	$self->{children}{$child} = Trie->new if (!defined($self->{children}{$child}));
	244
	245	$self->{children}{$child}->insert($tail, $label, $value);
	246	}
	247
	248	sub filter_depth {
	249	my ($self, $togo) = @_;
	250
	251	my $new = Trie->new;
	252
	253	if ($togo != 0) {
	254	my $found = 0;
	255	foreach my $key (sort keys %{$self->{children}}) {
	256	if ($togo > length($key) \|\| defined $self->{children}{$key}->{value}) {
	257	my $child = $self->{children}{$key}->filter_depth($togo - length($key));
	258
	259	$new->{children}{$key}= $child if defined $child;
	260	$found = 1 if defined $child;
	261	}
	262	}
	263	return undef if (!$found);
	264	} else {
	265	$new->{value} = $self->{value};
	266	$new->{label} = $self->{label};
	267	}
	268
	269	return $new;
	270	}
	271
	272	# Reinsert all value nodes into the specified $trie, prepending $prefix
	273	# to their $paths.
	274	sub reinsert_value_nodes_into {
	275	my ($self, $trie, $prefix) = @_;
	276
	277	$trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value});
	278
	279	foreach my $key (sort keys %{$self->{children}}) {
	280	$self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
	281	}
	282	}
	283
	284	# Find an earlier split due a an ambiguous character
	285	sub find_ealier_split {
	286	my ($self, $key) = @_;
	287
	288	if ($ignore_case) {
	289	for my $i (0..length($key)-1) {
	290	# If the key starts with an ambiguous character, we need to
	291	# take only it. Otherwise, we need to take everything
	292	# before the character.
	293	return $self->alignpower2($i \|\| 1) if (main::ambiguous(substr($key, $i, 1)));
	294	}
	295	}
	296	return $self->alignpower2(length $key);
	297	}
	298
	299	# Rebuild the trie, splitting at ambigous chars, and unifying key lengths
	300	sub rebuild_tree {
	301	my $self = shift;
	302	# Determine if/where we need to split before an ambiguous character
	303	my $new_split = 99999999999999999;
	304	foreach my $key (sort keys %{$self->{children}}) {
	305	my $special_length = $self->find_ealier_split($key);
	306	$new_split = $special_length if ($special_length < $new_split);
	307	}
	308
	309	# Start building a new uniform trie
	310	my $newself = Trie->new;
	311	$newself->{label} = $self->{label};
	312	$newself->{value} = $self->{value};
	313	$newself->{children} = {};
	314
	315	foreach my $key (sort keys %{$self->{children}}) {
	316	my $head = substr($key, 0, $new_split);
	317	my $tail = substr($key, $new_split);
	318	# Rebuild the child node at $head, pushing $tail downwards
	319	$newself->{children}{$head} //= Trie->new;
	320	$self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail);
	321	# We took up to one special character of each key label. There might
	322	# be more, so we need to rebuild recursively.
	323	$newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree();
	324	}
	325
	326	return $newself;
	327	}
	328	}
	329
	330	# Code generator for C and C++
	331	package CCodeGen {
	332	my $static = ($code_name eq $header_name) ? "static" : "";
	333	my $enum_specifier = $enum_class ? "enum class" : "enum";
	334
	335	sub new {
	336	my $class = shift;
	337	my $self = {};
	338	bless $self, $class;
	339
	340	return $self;
	341	}
	342
	343	sub open_output {
	344	my $self = shift;
	345	if ($code_name ne "-") {
	346	open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
	347	} else {
	348	$code = *STDOUT;
	349	}
	350	if($code_name eq $header_name) {
	351	$header = $code;
	352	} elsif ($header_name ne "-") {
	353	open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
	354	} else {
	355	$header = *STDOUT;
	356	}
	357	}
	358
	359	sub word_to_label {
	360	my ($class, $word) = @_;
	361
	362	$word =~ s/_/__/g;
	363	$word =~ s/-/_/g;
	364	return $word;
	365	}
	366
	367	# Return a case label, by shifting and or-ing bytes in the word
	368	sub case_label {
	369	my ($self, $key) = @_;
	370
	371	return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;
	372
	373	my $output = '0';
	374
	375	for my $i (0..length($key)-1) {
	376	$output .= sprintf("\| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
	377	}
	378
	379	return $output;
	380	}
	381
	382	# Return an appropriate read instruction for $length bytes from $offset
	383	sub switch_key {
	384	my ($self, $offset, $length) = @_;
	385
	386	return "string[$offset]" if $length == 1;
	387	return sprintf("((triehash_uu%s) &string[$offset])", $length * 8);
	388	}
	389
	390	sub print_table {
	391	my ($self, $trie, $fh, $indent, $index) = @_;
	392	$indent //= 0;
	393	$index //= 0;
	394
	395	if (defined $trie->{value}) {
	396	printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
	397	return;
	398	}
	399
	400	# The difference between lowercase and uppercase alphabetical characters
	401	# is that they have one bit flipped. If we have alphabetical characters
	402	# in the search space, and the entire search space works fine if we
	403	# always turn on the flip, just OR the character we are switching over
	404	# with the bit.
	405	my $want_use_bit = 0;
	406	my $can_use_bit = 1;
	407	my $key_length = 0;
	408	foreach my $key (sort keys %{$trie->{children}}) {
	409	$can_use_bit &= not main::ambiguous($key);
	410	$want_use_bit \|= ($key =~ /^[a-zA-Z]+$/);
	411	$key_length = length($key);
	412	}
	413
	414	if ($ignore_case && $can_use_bit && $want_use_bit) {
	415	printf $fh ((" " x $indent) . "switch(%s \| 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
	416	} else {
	417	printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
	418	}
	419
	420	my $notfirst = 0;
	421	foreach my $key (sort keys %{$trie->{children}}) {
	422	if ($notfirst) {
	423	printf $fh (" " x $indent . " break;\n");
	424	}
	425	if ($ignore_case) {
	426	printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key)));
	427	printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
	428	} else {
	429	printf $fh (" " x $indent . "case %s:\n", $self->case_label($key));
	430	}
	431
	432	$self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
	433
	434	$notfirst=1;
	435	}
	436
	437	printf $fh (" " x $indent . "}\n");
	438	}
	439
	440	sub print_words {
	441	my ($self, $trie, $fh, $indent, $sofar) = @_;
	442
	443	$indent //= 0;
	444	$sofar //= "";
	445
	446
	447	printf $fh (" " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value};
	448
	449	foreach my $key (sort keys %{$trie->{children}}) {
	450	$self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key);
	451	}
	452	}
	453
	454	sub print_functions {
	455	my ($self, $trie, %lengths) = @_;
	456	foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
	457	print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
	458	print $code ("{\n");
	459	$self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
	460	printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
	461	print $code ("}\n");
	462	}
	463	}
	464
	465	sub main {
	466	my ($self, $trie, $num_values, %lengths) = @_;
	467	print $header ("#ifndef TRIE_HASH_${function_name}\n");
	468	print $header ("#define TRIE_HASH_${function_name}\n");
	469	print $header ("#include <stddef.h>\n");
	470	print $header ("#include <stdint.h>\n");
	471	foreach my $include (@includes) {
	472	print $header ("#include $include\n");
	473	}
	474	printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
	475	print $header ("${enum_specifier} ${enum_name} {\n");
	476	$self->print_words($trie, $header, 1);
	477	printf $header (" $unknown_label = $unknown,\n");
	478	print $header ("};\n");
	479	print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");
	480
	481	print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);
	482
	483	if ($multi_byte) {
	484	print $code ("#ifdef __GNUC__\n");
	485	for (my $i=16; $i <= 64; $i *= 2) {
	486	print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
	487	print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
	488	}
	489
	490	print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
	491	print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
	492	print $code ("#else\n");
	493	print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
	494	print $code ("#endif\n");
	495	print $code ("#if (!defined(__ARM_ARCH) \|\| defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
	496	print $code ("#define TRIE_HASH_MULTI_BYTE\n");
	497	print $code ("#endif\n");
	498	print $code ("#endif /GNUC /\n");
	499
	500	print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
	501	$self->print_functions($trie, %lengths);
	502	$multi_byte = 0;
	503	print $code ("#else\n");
	504	$self->print_functions($trie, %lengths);
	505	print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
	506	} else {
	507	$self->print_functions($trie, %lengths);
	508	}
	509
	510	print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
	511	print $code ("{\n");
	512	print $code (" switch (length) {\n");
	513	foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
	514	print $code (" case $local_length:\n");
	515	print $code (" return ${function_name}${local_length}(string);\n");
	516	}
	517	print $code (" default:\n");
	518	printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
	519	print $code (" }\n");
	520	print $code ("}\n");
	521
	522	# Print end of header here, in case header and code point to the same file
	523	print $header ("#endif /* TRIE_HASH_${function_name} */\n");
	524	}
	525	}
	526
	527	# Check if the word can be reached by exactly one word in (alphabet OR 0x20).
	528	sub ambiguous {
	529	my $word = shift;
	530
	531	foreach my $char (split //, $word) {
	532	# Setting the lowercase flag in the character produces a different
	533	# character, the character would thus not be matched.
	534	return 1 if ((ord($char) \| 0x20) != ord(lc($char)));
	535
	536	# A word is also ambiguous if any character in lowercase can be reached
	537	# by ORing 0x20 from another character in the charset that is not a
	538	# lowercase character of the current character.
	539	# Assume that we have UTF-8 and the most significant bit can be set
	540	for my $i (0..255) {
	541	return 1 if (($i \| 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char));
	542	}
	543	}
	544
	545	return 0;
	546	}
	547
	548	sub build_trie {
	549	my $codegen = shift;
	550	my $trie = Trie->new;
	551
	552	my $counter = $counter_start;
	553	my %lengths;
	554
	555	open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
	556	while (my $line = <$input>) {
	557	my ($label, $word, $value) = $line =~/\s(?:([^~\s]+)\s~)?(?:\s([^~=\s]+)\s)?(?:=\s([^\s]+)\s+)?\s/;
	558
	559	if (defined $word) {
	560	$counter = $value if defined($value);
	561	$label //= $codegen->word_to_label($word);
	562
	563	$trie->insert($word, $label, $counter);
	564	$lengths{length($word)} = 1;
	565	$counter++;
	566	} elsif (defined $value) {
	567	$unknown = $value;
	568	$unknown_label = $label if defined($label);
	569	$counter = $value + 1;
	570	} else {
	571	die "Invalid line: $line";
	572	}
	573	}
	574
	575	return ($trie, $counter, %lengths);
	576	}
	577
	578	# Generates an ASCII art tree
	579	package TreeCodeGen {
	580
	581	sub new {
	582	my $class = shift;
	583	my $self = {};
	584	bless $self, $class;
	585
	586	return $self;
	587	}
	588
	589	sub word_to_label {
	590	my ($self, $word) = @_;
	591	return $word;
	592	}
	593
	594	sub main {
	595	my ($self, $trie, $counter, %lengths) = @_;
	596	printf $code ("┌────────────────────────────────────────────────────┐\n");
	597	printf $code ("│ Initial trie │\n");
	598	printf $code ("└────────────────────────────────────────────────────┘\n");
	599	$self->print($trie);
	600	printf $code ("┌────────────────────────────────────────────────────┐\n");
	601	printf $code ("│ Rebuilt trie │\n");
	602	printf $code ("└────────────────────────────────────────────────────┘\n");
	603	$self->print($trie->rebuild_tree());
	604
	605	foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
	606	printf $code ("┌────────────────────────────────────────────────────┐\n");
	607	printf $code ("│ Trie for words of length %-4d │\n", $local_length);
	608	printf $code ("└────────────────────────────────────────────────────┘\n");
	609	$self->print($trie->filter_depth($local_length)->rebuild_tree());
	610	}
	611	}
	612
	613	sub open_output {
	614	my $self = shift;
	615	if ($code_name ne "-") {
	616	open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
	617	} else {
	618	$code = *STDOUT;
	619	}
	620	}
	621
	622	# Print a trie
	623	sub print {
	624	my ($self, $trie, $depth) = @_;
	625	$depth //= 0;
	626
	627	print $code (" → ") if defined($trie->{label});
	628	print $code ($trie->{label} // "", "\n");
	629	foreach my $key (sort keys %{$trie->{children}}) {
	630	print $code ("│ " x ($depth), "├── $key");
	631	$self->print($trie->{children}{$key}, $depth + 1);
	632	}
	633	}
	634	}
	635
	636	my %codegens = (
	637	C => "CCodeGen",
	638	tree => "TreeCodeGen",
	639	);
	640
	641
	642	defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
	643	my $codegen = $codegens{$language}->new();
	644	my ($trie, $counter, %lengths) = build_trie($codegen);
	645
	646	$codegen->open_output();
	647	$codegen->main($trie, $counter, %lengths);
	648
	649
	650	=head1 LICENSE
	651
	652	triehash is available under the MIT/Expat license, see the source code
	653	for more information.
	654
	655	=head1 AUTHOR
	656
	657	Julian Andres Klode <jak@jak-linux.org>
	658
	659	=cut
	660