3 # Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
5 # Permission is hereby granted, free of charge, to any person obtaining a copy
6 # of this software and associated documentation files (the "Software"), to deal
7 # in the Software without restriction, including without limitation the rights
8 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 # copies of the Software, and to permit persons to whom the Software is
10 # furnished to do so, subject to the following conditions:
12 # The above copyright notice and this permission notice shall be included in
13 # all copies or substantial portions of the Software.
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 triehash - Generate a perfect hash function derived from a trie.
36 B<triehash> [S<I<option>>] [S<I<input file>>]
40 triehash takes a list of words in input file and generates a function and
41 an enumeration to describe the word
43 =head1 INPUT FILE FORMAT
45 The file consists of multiple lines of the form:
47 [label ~ ] word [= value]
49 This maps word to value, and generates an enumeration with entries of the form:
53 If I<label> is undefined, the word will be used, the minus character will be
54 replaced by an underscore. If value is undefined it is counted upwards from
57 There may also be one line of the format
61 Which defines the value to be used for non-existing keys. Note that this also
62 changes default value for other keys, as for normal entries. So if you place
66 at the beginning of the file, unknown strings map to 0, and the other strings
67 map to values starting with 1. If label is not specified, the default is
74 =item B<-C>I<.c file> B<--code>=I<.c file>
76 Generate code in the given file.
78 =item B<-H>I<header file> B<--header>=I<header file>
80 Generate a header in the given file, containing a declaration of the hash
81 function and an enumeration.
83 =item B<--enum-name=>I<word>
85 The name of the enumeration.
87 =item B<--function-name=>I<word>
89 The name of the function.
91 =item B<--namespace=>I<name>
93 Put the function and enum into a namespace (C++)
95 =item B<--class=>I<name>
97 Put the function and enum into a class (C++)
101 Generate an enum class instead of an enum (C++)
103 =item B<--counter-name=>I<name>
105 Use I<name> for a counter that is set to the latest entry in the enumeration
106 + 1. This can be useful for defining array sizes.
110 Wrap everything into an extern "C" block. Not compatible with the C++
111 options, as a header with namespaces, classes, or enum classes is not
114 =item B<--multi-byte>=I<value>
116 Generate code reading multiple bytes at once. The value is a string of power
117 of twos to enable. The default value is 320 meaning that 8, 4, and single byte
118 reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you
119 also want to allow 2-byte reads. 2-byte reads are disabled by default because
120 they negatively affect performance on older Intel architectures.
122 This generates code for both multiple bytes and single byte reads, but only
123 enables the multiple byte reads of GNU C compatible compilers, as the following
128 =item Byte-aligned integers
130 We must be able to generate integers that are aligned to a single byte using:
132 typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
136 The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.
140 We forcefully disable multi-byte reads on platforms where the variable
141 I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
142 as there is a measurable overhead from emulating the unaligned reads on
145 =item B<--language=>I<language>
147 Generate a file in the specified language. Currently known are 'C' and 'tree',
148 the latter generating a tree.
150 =item B<--include=>I<header>
152 Add the header to the include statements of the header file. The value must
153 be surrounded by quotes or angle brackets for C code. May be specified multiple
161 my $unknown_label = "Unknown";
162 my $counter_start = 0;
163 my $enum_name = "PerfectKey";
164 my $function_name = "PerfectHash";
168 my $header_name = "-";
172 my $multi_byte = "320";
174 my $counter_name = undef;
178 Getopt
::Long
::config
('default',
185 GetOptions
("code|C=s" => \
$code_name,
186 "header|H=s" => \
$header_name,
187 "function-name=s" => \
$function_name,
188 "ignore-case" => \
$ignore_case,
189 "enum-name=s" => \
$enum_name,
190 "language|l=s" => \
$language,
191 "multi-byte=s" => \
$multi_byte,
192 "enum-class" => \
$enum_class,
193 "include=s" => \
@includes,
194 "counter-name=s" => \
$counter_name)
195 or die("Could not parse options!");
205 $self->{children
} = {};
206 $self->{value
} = undef;
207 $self->{label
} = undef;
212 # Return the largest power of 2 smaller or equal to the argument
214 my ($self, $length) = @_;
216 return 8 if ($length >= 8 && $multi_byte =~ /3/);
217 return 4 if ($length >= 4 && $multi_byte =~ /2/);
218 return 2 if ($length >= 2 && $multi_byte =~ /1/);
223 # Split the key into a head block and a tail
225 my ($self, $key) = @_;
226 my $length = length $key;
227 my $split = $self->alignpower2($length);
229 return (substr($key, 0, $split), substr($key, $split));
233 my ($self, $key, $label, $value) = @_;
235 if (length($key) == 0) {
236 $self->{label
} = $label;
237 $self->{value
} = $value;
241 my ($child, $tail) = $self->split_key($key);
243 $self->{children
}{$child} = Trie-
>new if (!defined($self->{children
}{$child}));
245 $self->{children
}{$child}->insert($tail, $label, $value);
249 my ($self, $togo) = @_;
255 foreach my $key (sort keys %{$self->{children
}}) {
256 if ($togo > length($key) || defined $self->{children
}{$key}->{value
}) {
257 my $child = $self->{children
}{$key}->filter_depth($togo - length($key));
259 $new->{children
}{$key}= $child if defined $child;
260 $found = 1 if defined $child;
263 return undef if (!$found);
265 $new->{value
} = $self->{value
};
266 $new->{label
} = $self->{label
};
272 # Reinsert all value nodes into the specified $trie, prepending $prefix
274 sub reinsert_value_nodes_into
{
275 my ($self, $trie, $prefix) = @_;
277 $trie->insert($prefix, $self->{label
}, $self->{value
}) if (defined $self->{value
});
279 foreach my $key (sort keys %{$self->{children
}}) {
280 $self->{children
}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
284 # Find an earlier split due a an ambiguous character
285 sub find_ealier_split
{
286 my ($self, $key) = @_;
289 for my $i (0..length($key)-1) {
290 # If the key starts with an ambiguous character, we need to
291 # take only it. Otherwise, we need to take everything
292 # before the character.
293 return $self->alignpower2($i || 1) if (main
::ambiguous
(substr($key, $i, 1)));
296 return $self->alignpower2(length $key);
299 # Rebuild the trie, splitting at ambigous chars, and unifying key lengths
302 # Determine if/where we need to split before an ambiguous character
303 my $new_split = 99999999999999999;
304 foreach my $key (sort keys %{$self->{children
}}) {
305 my $special_length = $self->find_ealier_split($key);
306 $new_split = $special_length if ($special_length < $new_split);
309 # Start building a new uniform trie
310 my $newself = Trie-
>new;
311 $newself->{label
} = $self->{label
};
312 $newself->{value
} = $self->{value
};
313 $newself->{children
} = {};
315 foreach my $key (sort keys %{$self->{children
}}) {
316 my $head = substr($key, 0, $new_split);
317 my $tail = substr($key, $new_split);
318 # Rebuild the child node at $head, pushing $tail downwards
319 $newself->{children
}{$head} //= Trie-
>new;
320 $self->{children
}{$key}->reinsert_value_nodes_into($newself->{children
}{$head}, $tail);
321 # We took up to one special character of each key label. There might
322 # be more, so we need to rebuild recursively.
323 $newself->{children
}{$head} = $newself->{children
}{$head}->rebuild_tree();
330 # Code generator for C and C++
332 my $static = ($code_name eq $header_name) ? "static" : "";
333 my $enum_specifier = $enum_class ? "enum class" : "enum";
345 if ($code_name ne "-") {
346 open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
350 if($code_name eq $header_name) {
352 } elsif ($header_name ne "-") {
353 open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
360 my ($class, $word) = @_;
367 # Return a case label, by shifting and or-ing bytes in the word
369 my ($self, $key) = @_;
371 return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;
375 for my $i (0..length($key)-1) {
376 $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
382 # Return an appropriate read instruction for $length bytes from $offset
384 my ($self, $offset, $length) = @_;
386 return "string[$offset]" if $length == 1;
387 return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
391 my ($self, $trie, $fh, $indent, $index) = @_;
395 if (defined $trie->{value
}) {
396 printf $fh (" " x
$indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label
});
400 # The difference between lowercase and uppercase alphabetical characters
401 # is that they have one bit flipped. If we have alphabetical characters
402 # in the search space, and the entire search space works fine if we
403 # always turn on the flip, just OR the character we are switching over
405 my $want_use_bit = 0;
408 foreach my $key (sort keys %{$trie->{children
}}) {
409 $can_use_bit &= not main
::ambiguous
($key);
410 $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
411 $key_length = length($key);
414 if ($ignore_case && $can_use_bit && $want_use_bit) {
415 printf $fh ((" " x
$indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x
$key_length);
417 printf $fh ((" " x
$indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
421 foreach my $key (sort keys %{$trie->{children
}}) {
423 printf $fh (" " x
$indent . " break;\n");
426 printf $fh (" " x
$indent . "case %s:\n", $self->case_label(lc($key)));
427 printf $fh (" " x
$indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
429 printf $fh (" " x
$indent . "case %s:\n", $self->case_label($key));
432 $self->print_table($trie->{children
}{$key}, $fh, $indent + 1, $index + length($key));
437 printf $fh (" " x
$indent . "}\n");
441 my ($self, $trie, $fh, $indent, $sofar) = @_;
447 printf $fh (" " x
$indent."%s = %s,\n", $trie->{label
}, $trie->{value
}) if defined $trie->{value
};
449 foreach my $key (sort keys %{$trie->{children
}}) {
450 $self->print_words($trie->{children
}{$key}, $fh, $indent, $sofar . $key);
454 sub print_functions
{
455 my ($self, $trie, %lengths) = @_;
456 foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
457 print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
459 $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
460 printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
466 my ($self, $trie, $num_values, %lengths) = @_;
467 print $header ("#ifndef TRIE_HASH_${function_name}\n");
468 print $header ("#define TRIE_HASH_${function_name}\n");
469 print $header ("#include <stddef.h>\n");
470 print $header ("#include <stdint.h>\n");
471 foreach my $include (@includes) {
472 print $header ("#include $include\n");
474 printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
475 print $header ("${enum_specifier} ${enum_name} {\n");
476 $self->print_words($trie, $header, 1);
477 printf $header (" $unknown_label = $unknown,\n");
478 print $header ("};\n");
479 print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");
481 print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);
484 print $code ("#ifdef __GNUC__\n");
485 for (my $i=16; $i <= 64; $i *= 2) {
486 print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
487 print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
490 print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
491 print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
492 print $code ("#else\n");
493 print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
494 print $code ("#endif\n");
495 print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
496 print $code ("#define TRIE_HASH_MULTI_BYTE\n");
497 print $code ("#endif\n");
498 print $code ("#endif /*GNUC */\n");
500 print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
501 $self->print_functions($trie, %lengths);
503 print $code ("#else\n");
504 $self->print_functions($trie, %lengths);
505 print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
507 $self->print_functions($trie, %lengths);
510 print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
512 print $code (" switch (length) {\n");
513 foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
514 print $code (" case $local_length:\n");
515 print $code (" return ${function_name}${local_length}(string);\n");
517 print $code (" default:\n");
518 printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
519 print $code (" }\n");
522 # Print end of header here, in case header and code point to the same file
523 print $header ("#endif /* TRIE_HASH_${function_name} */\n");
527 # Check if the word can be reached by exactly one word in (alphabet OR 0x20).
531 foreach my $char (split //, $word) {
532 # Setting the lowercase flag in the character produces a different
533 # character, the character would thus not be matched.
534 return 1 if ((ord($char) | 0x20) != ord(lc($char)));
536 # A word is also ambiguous if any character in lowercase can be reached
537 # by ORing 0x20 from another character in the charset that is not a
538 # lowercase character of the current character.
539 # Assume that we have UTF-8 and the most significant bit can be set
541 return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char));
550 my $trie = Trie-
>new;
552 my $counter = $counter_start;
555 open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
556 while (my $line = <$input>) {
557 my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/;
560 $counter = $value if defined($value);
561 $label //= $codegen->word_to_label($word);
563 $trie->insert($word, $label, $counter);
564 $lengths{length($word)} = 1;
566 } elsif (defined $value) {
568 $unknown_label = $label if defined($label);
569 $counter = $value + 1;
571 die "Invalid line: $line";
575 return ($trie, $counter, %lengths);
578 # Generates an ASCII art tree
579 package TreeCodeGen
{
590 my ($self, $word) = @_;
595 my ($self, $trie, $counter, %lengths) = @_;
596 printf $code ("┌────────────────────────────────────────────────────┐\n");
597 printf $code ("│ Initial trie │\n");
598 printf $code ("└────────────────────────────────────────────────────┘\n");
600 printf $code ("┌────────────────────────────────────────────────────┐\n");
601 printf $code ("│ Rebuilt trie │\n");
602 printf $code ("└────────────────────────────────────────────────────┘\n");
603 $self->print($trie->rebuild_tree());
605 foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
606 printf $code ("┌────────────────────────────────────────────────────┐\n");
607 printf $code ("│ Trie for words of length %-4d │\n", $local_length);
608 printf $code ("└────────────────────────────────────────────────────┘\n");
609 $self->print($trie->filter_depth($local_length)->rebuild_tree());
615 if ($code_name ne "-") {
616 open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
624 my ($self, $trie, $depth) = @_;
627 print $code (" → ") if defined($trie->{label
});
628 print $code ($trie->{label
} // "", "\n");
629 foreach my $key (sort keys %{$trie->{children
}}) {
630 print $code ("│ " x
($depth), "├── $key");
631 $self->print($trie->{children
}{$key}, $depth + 1);
638 tree
=> "TreeCodeGen",
642 defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
643 my $codegen = $codegens{$language}->new();
644 my ($trie, $counter, %lengths) = build_trie
($codegen);
646 $codegen->open_output();
647 $codegen->main($trie, $counter, %lengths);
652 triehash is available under the MIT/Expat license, see the source code
653 for more information.
657 Julian Andres Klode <jak@jak-linux.org>