]> git.saurik.com Git - apt.git/blame_incremental - triehash/triehash.pl
The entire concept of PendingError() is flawed :/.
[apt.git] / triehash / triehash.pl
... / ...
CommitLineData
1#!/usr/bin/perl -w
2#
3# Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
4#
5# Permission is hereby granted, free of charge, to any person obtaining a copy
6# of this software and associated documentation files (the "Software"), to deal
7# in the Software without restriction, including without limitation the rights
8# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9# copies of the Software, and to permit persons to whom the Software is
10# furnished to do so, subject to the following conditions:
11#
12# The above copyright notice and this permission notice shall be included in
13# all copies or substantial portions of the Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21# THE SOFTWARE.
22
23
24=head1 NAME
25
26triehash - Generate a perfect hash function derived from a trie.
27
28=cut
29
30use strict;
31use warnings;
32use Getopt::Long;
33
34=head1 SYNOPSIS
35
36B<triehash> [S<I<option>>] [S<I<input file>>]
37
38=head1 DESCRIPTION
39
40triehash takes a list of words in input file and generates a function and
41an enumeration to describe the word
42
43=head1 INPUT FILE FORMAT
44
45The file consists of multiple lines of the form:
46
47 [label ~ ] word [= value]
48
49This maps word to value, and generates an enumeration with entries of the form:
50
51 label = value
52
53If I<label> is undefined, the word will be used, the minus character will be
54replaced by an underscore. If value is undefined it is counted upwards from
55the last value.
56
57There may also be one line of the format
58
59 [ label ~] = value
60
61Which defines the value to be used for non-existing keys. Note that this also
62changes default value for other keys, as for normal entries. So if you place
63
64 = 0
65
66at the beginning of the file, unknown strings map to 0, and the other strings
67map to values starting with 1. If label is not specified, the default is
68I<Unknown>.
69
70=head1 OPTIONS
71
72=over 4
73
74=item B<-C>I<.c file> B<--code>=I<.c file>
75
76Generate code in the given file.
77
78=item B<-H>I<header file> B<--header>=I<header file>
79
80Generate a header in the given file, containing a declaration of the hash
81function and an enumeration.
82
83=item B<--enum-name=>I<word>
84
85The name of the enumeration.
86
87=item B<--function-name=>I<word>
88
89The name of the function.
90
91=item B<--namespace=>I<name>
92
93Put the function and enum into a namespace (C++)
94
95=item B<--class=>I<name>
96
97Put the function and enum into a class (C++)
98
99=item B<--enum-class>
100
101Generate an enum class instead of an enum (C++)
102
103=item B<--counter-name=>I<name>
104
105Use I<name> for a counter that is set to the latest entry in the enumeration
106+ 1. This can be useful for defining array sizes.
107
108=item B<--extern-c>
109
110Wrap everything into an extern "C" block. Not compatible with the C++
111options, as a header with namespaces, classes, or enum classes is not
112valid C.
113
114=item B<--multi-byte>=I<value>
115
116Generate code reading multiple bytes at once. The value is a string of power
117of twos to enable. The default value is 320 meaning that 8, 4, and single byte
118reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you
119also want to allow 2-byte reads. 2-byte reads are disabled by default because
120they negatively affect performance on older Intel architectures.
121
122This generates code for both multiple bytes and single byte reads, but only
123enables the multiple byte reads of GNU C compatible compilers, as the following
124extensions are used:
125
126=over 8
127
128=item Byte-aligned integers
129
130We must be able to generate integers that are aligned to a single byte using:
131
132 typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
133
134=item Byte-order
135
136The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.
137
138=back
139
140We forcefully disable multi-byte reads on platforms where the variable
141I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
142as there is a measurable overhead from emulating the unaligned reads on
143ARM.
144
145=item B<--language=>I<language>
146
147Generate a file in the specified language. Currently known are 'C' and 'tree',
148the latter generating a tree.
149
150=item B<--include=>I<header>
151
152Add the header to the include statements of the header file. The value must
153be surrounded by quotes or angle brackets for C code. May be specified multiple
154times.
155
156=back
157
158=cut
159
160my $unknown = -1;
161my $unknown_label = "Unknown";
162my $counter_start = 0;
163my $enum_name = "PerfectKey";
164my $function_name = "PerfectHash";
165my $enum_class = 0;
166
167my $code_name = "-";
168my $header_name = "-";
169my $code;
170my $header;
171my $ignore_case = 0;
172my $multi_byte = "320";
173my $language = 'C';
174my $counter_name = undef;
175my @includes = ();
176
177
178Getopt::Long::config('default',
179 'bundling',
180 'no_getopt_compat',
181 'no_auto_abbrev',
182 'permute',
183 'auto_help');
184
185GetOptions ("code|C=s" => \$code_name,
186 "header|H=s" => \$header_name,
187 "function-name=s" => \$function_name,
188 "ignore-case" => \$ignore_case,
189 "enum-name=s" => \$enum_name,
190 "language|l=s" => \$language,
191 "multi-byte=s" => \$multi_byte,
192 "enum-class" => \$enum_class,
193 "include=s" => \@includes,
194 "counter-name=s" => \$counter_name)
195 or die("Could not parse options!");
196
197
198package Trie {
199
200 sub new {
201 my $class = shift;
202 my $self = {};
203 bless $self, $class;
204
205 $self->{children} = {};
206 $self->{value} = undef;
207 $self->{label} = undef;
208
209 return $self;
210 }
211
212 # Return the largest power of 2 smaller or equal to the argument
213 sub alignpower2 {
214 my ($self, $length) = @_;
215
216 return 8 if ($length >= 8 && $multi_byte =~ /3/);
217 return 4 if ($length >= 4 && $multi_byte =~ /2/);
218 return 2 if ($length >= 2 && $multi_byte =~ /1/);
219
220 return 1;
221 }
222
223 # Split the key into a head block and a tail
224 sub split_key {
225 my ($self, $key) = @_;
226 my $length = length $key;
227 my $split = $self->alignpower2($length);
228
229 return (substr($key, 0, $split), substr($key, $split));
230 }
231
232 sub insert {
233 my ($self, $key, $label, $value) = @_;
234
235 if (length($key) == 0) {
236 $self->{label} = $label;
237 $self->{value} = $value;
238 return;
239 }
240
241 my ($child, $tail) = $self->split_key($key);
242
243 $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child}));
244
245 $self->{children}{$child}->insert($tail, $label, $value);
246 }
247
248 sub filter_depth {
249 my ($self, $togo) = @_;
250
251 my $new = Trie->new;
252
253 if ($togo != 0) {
254 my $found = 0;
255 foreach my $key (sort keys %{$self->{children}}) {
256 if ($togo > length($key) || defined $self->{children}{$key}->{value}) {
257 my $child = $self->{children}{$key}->filter_depth($togo - length($key));
258
259 $new->{children}{$key}= $child if defined $child;
260 $found = 1 if defined $child;
261 }
262 }
263 return undef if (!$found);
264 } else {
265 $new->{value} = $self->{value};
266 $new->{label} = $self->{label};
267 }
268
269 return $new;
270 }
271
272 # Reinsert all value nodes into the specified $trie, prepending $prefix
273 # to their $paths.
274 sub reinsert_value_nodes_into {
275 my ($self, $trie, $prefix) = @_;
276
277 $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value});
278
279 foreach my $key (sort keys %{$self->{children}}) {
280 $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
281 }
282 }
283
284 # Find an earlier split due a an ambiguous character
285 sub find_ealier_split {
286 my ($self, $key) = @_;
287
288 if ($ignore_case) {
289 for my $i (0..length($key)-1) {
290 # If the key starts with an ambiguous character, we need to
291 # take only it. Otherwise, we need to take everything
292 # before the character.
293 return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1)));
294 }
295 }
296 return $self->alignpower2(length $key);
297 }
298
299 # Rebuild the trie, splitting at ambigous chars, and unifying key lengths
300 sub rebuild_tree {
301 my $self = shift;
302 # Determine if/where we need to split before an ambiguous character
303 my $new_split = 99999999999999999;
304 foreach my $key (sort keys %{$self->{children}}) {
305 my $special_length = $self->find_ealier_split($key);
306 $new_split = $special_length if ($special_length < $new_split);
307 }
308
309 # Start building a new uniform trie
310 my $newself = Trie->new;
311 $newself->{label} = $self->{label};
312 $newself->{value} = $self->{value};
313 $newself->{children} = {};
314
315 foreach my $key (sort keys %{$self->{children}}) {
316 my $head = substr($key, 0, $new_split);
317 my $tail = substr($key, $new_split);
318 # Rebuild the child node at $head, pushing $tail downwards
319 $newself->{children}{$head} //= Trie->new;
320 $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail);
321 # We took up to one special character of each key label. There might
322 # be more, so we need to rebuild recursively.
323 $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree();
324 }
325
326 return $newself;
327 }
328}
329
330# Code generator for C and C++
331package CCodeGen {
332 my $static = ($code_name eq $header_name) ? "static" : "";
333 my $enum_specifier = $enum_class ? "enum class" : "enum";
334
335 sub new {
336 my $class = shift;
337 my $self = {};
338 bless $self, $class;
339
340 return $self;
341 }
342
343 sub open_output {
344 my $self = shift;
345 if ($code_name ne "-") {
346 open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
347 } else {
348 $code = *STDOUT;
349 }
350 if($code_name eq $header_name) {
351 $header = $code;
352 } elsif ($header_name ne "-") {
353 open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
354 } else {
355 $header = *STDOUT;
356 }
357 }
358
359 sub word_to_label {
360 my ($class, $word) = @_;
361
362 $word =~ s/_/__/g;
363 $word =~ s/-/_/g;
364 return $word;
365 }
366
367 # Return a case label, by shifting and or-ing bytes in the word
368 sub case_label {
369 my ($self, $key) = @_;
370
371 return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;
372
373 my $output = '0';
374
375 for my $i (0..length($key)-1) {
376 $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
377 }
378
379 return $output;
380 }
381
382 # Return an appropriate read instruction for $length bytes from $offset
383 sub switch_key {
384 my ($self, $offset, $length) = @_;
385
386 return "string[$offset]" if $length == 1;
387 return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
388 }
389
390 sub print_table {
391 my ($self, $trie, $fh, $indent, $index) = @_;
392 $indent //= 0;
393 $index //= 0;
394
395 if (defined $trie->{value}) {
396 printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
397 return;
398 }
399
400 # The difference between lowercase and uppercase alphabetical characters
401 # is that they have one bit flipped. If we have alphabetical characters
402 # in the search space, and the entire search space works fine if we
403 # always turn on the flip, just OR the character we are switching over
404 # with the bit.
405 my $want_use_bit = 0;
406 my $can_use_bit = 1;
407 my $key_length = 0;
408 foreach my $key (sort keys %{$trie->{children}}) {
409 $can_use_bit &= not main::ambiguous($key);
410 $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
411 $key_length = length($key);
412 }
413
414 if ($ignore_case && $can_use_bit && $want_use_bit) {
415 printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
416 } else {
417 printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
418 }
419
420 my $notfirst = 0;
421 foreach my $key (sort keys %{$trie->{children}}) {
422 if ($notfirst) {
423 printf $fh (" " x $indent . " break;\n");
424 }
425 if ($ignore_case) {
426 printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key)));
427 printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
428 } else {
429 printf $fh (" " x $indent . "case %s:\n", $self->case_label($key));
430 }
431
432 $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
433
434 $notfirst=1;
435 }
436
437 printf $fh (" " x $indent . "}\n");
438 }
439
440 sub print_words {
441 my ($self, $trie, $fh, $indent, $sofar) = @_;
442
443 $indent //= 0;
444 $sofar //= "";
445
446
447 printf $fh (" " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value};
448
449 foreach my $key (sort keys %{$trie->{children}}) {
450 $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key);
451 }
452 }
453
454 sub print_functions {
455 my ($self, $trie, %lengths) = @_;
456 foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
457 print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
458 print $code ("{\n");
459 $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
460 printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
461 print $code ("}\n");
462 }
463 }
464
465 sub main {
466 my ($self, $trie, $num_values, %lengths) = @_;
467 print $header ("#ifndef TRIE_HASH_${function_name}\n");
468 print $header ("#define TRIE_HASH_${function_name}\n");
469 print $header ("#include <stddef.h>\n");
470 print $header ("#include <stdint.h>\n");
471 foreach my $include (@includes) {
472 print $header ("#include $include\n");
473 }
474 printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
475 print $header ("${enum_specifier} ${enum_name} {\n");
476 $self->print_words($trie, $header, 1);
477 printf $header (" $unknown_label = $unknown,\n");
478 print $header ("};\n");
479 print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");
480
481 print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);
482
483 if ($multi_byte) {
484 print $code ("#ifdef __GNUC__\n");
485 for (my $i=16; $i <= 64; $i *= 2) {
486 print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
487 print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
488 }
489
490 print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
491 print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
492 print $code ("#else\n");
493 print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
494 print $code ("#endif\n");
495 print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
496 print $code ("#define TRIE_HASH_MULTI_BYTE\n");
497 print $code ("#endif\n");
498 print $code ("#endif /*GNUC */\n");
499
500 print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
501 $self->print_functions($trie, %lengths);
502 $multi_byte = 0;
503 print $code ("#else\n");
504 $self->print_functions($trie, %lengths);
505 print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
506 } else {
507 $self->print_functions($trie, %lengths);
508 }
509
510 print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
511 print $code ("{\n");
512 print $code (" switch (length) {\n");
513 foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
514 print $code (" case $local_length:\n");
515 print $code (" return ${function_name}${local_length}(string);\n");
516 }
517 print $code (" default:\n");
518 printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
519 print $code (" }\n");
520 print $code ("}\n");
521
522 # Print end of header here, in case header and code point to the same file
523 print $header ("#endif /* TRIE_HASH_${function_name} */\n");
524 }
525}
526
527# Check if the word can be reached by exactly one word in (alphabet OR 0x20).
528sub ambiguous {
529 my $word = shift;
530
531 foreach my $char (split //, $word) {
532 # Setting the lowercase flag in the character produces a different
533 # character, the character would thus not be matched.
534 return 1 if ((ord($char) | 0x20) != ord(lc($char)));
535
536 # A word is also ambiguous if any character in lowercase can be reached
537 # by ORing 0x20 from another character in the charset that is not a
538 # lowercase character of the current character.
539 # Assume that we have UTF-8 and the most significant bit can be set
540 for my $i (0..255) {
541 return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char));
542 }
543 }
544
545 return 0;
546}
547
548sub build_trie {
549 my $codegen = shift;
550 my $trie = Trie->new;
551
552 my $counter = $counter_start;
553 my %lengths;
554
555 open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
556 while (my $line = <$input>) {
557 my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/;
558
559 if (defined $word) {
560 $counter = $value if defined($value);
561 $label //= $codegen->word_to_label($word);
562
563 $trie->insert($word, $label, $counter);
564 $lengths{length($word)} = 1;
565 $counter++;
566 } elsif (defined $value) {
567 $unknown = $value;
568 $unknown_label = $label if defined($label);
569 $counter = $value + 1;
570 } else {
571 die "Invalid line: $line";
572 }
573 }
574
575 return ($trie, $counter, %lengths);
576}
577
578# Generates an ASCII art tree
579package TreeCodeGen {
580
581 sub new {
582 my $class = shift;
583 my $self = {};
584 bless $self, $class;
585
586 return $self;
587 }
588
589 sub word_to_label {
590 my ($self, $word) = @_;
591 return $word;
592 }
593
594 sub main {
595 my ($self, $trie, $counter, %lengths) = @_;
596 printf $code ("┌────────────────────────────────────────────────────┐\n");
597 printf $code ("│ Initial trie │\n");
598 printf $code ("└────────────────────────────────────────────────────┘\n");
599 $self->print($trie);
600 printf $code ("┌────────────────────────────────────────────────────┐\n");
601 printf $code ("│ Rebuilt trie │\n");
602 printf $code ("└────────────────────────────────────────────────────┘\n");
603 $self->print($trie->rebuild_tree());
604
605 foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
606 printf $code ("┌────────────────────────────────────────────────────┐\n");
607 printf $code ("│ Trie for words of length %-4d │\n", $local_length);
608 printf $code ("└────────────────────────────────────────────────────┘\n");
609 $self->print($trie->filter_depth($local_length)->rebuild_tree());
610 }
611 }
612
613 sub open_output {
614 my $self = shift;
615 if ($code_name ne "-") {
616 open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
617 } else {
618 $code = *STDOUT;
619 }
620 }
621
622 # Print a trie
623 sub print {
624 my ($self, $trie, $depth) = @_;
625 $depth //= 0;
626
627 print $code (" → ") if defined($trie->{label});
628 print $code ($trie->{label} // "", "\n");
629 foreach my $key (sort keys %{$trie->{children}}) {
630 print $code ("│ " x ($depth), "├── $key");
631 $self->print($trie->{children}{$key}, $depth + 1);
632 }
633 }
634}
635
636my %codegens = (
637 C => "CCodeGen",
638 tree => "TreeCodeGen",
639);
640
641
642defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
643my $codegen = $codegens{$language}->new();
644my ($trie, $counter, %lengths) = build_trie($codegen);
645
646$codegen->open_output();
647$codegen->main($trie, $counter, %lengths);
648
649
650=head1 LICENSE
651
652triehash is available under the MIT/Expat license, see the source code
653for more information.
654
655=head1 AUTHOR
656
657Julian Andres Klode <jak@jak-linux.org>
658
659=cut
660