[apt.git] / triehash / triehash.pl

#!/usr/bin/perl -w
#
# Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.


=head1 NAME

triehash - Generate a perfect hash function derived from a trie.

=cut

use strict;
use warnings;
use Getopt::Long;

=head1 SYNOPSIS

B<triehash> [S<I<option>>] [S<I<input file>>]

=head1 DESCRIPTION

triehash takes a list of words in input file and generates a function and
an enumeration to describe the word

=head1 INPUT FILE FORMAT

The file consists of multiple lines of the form:

    [label ~ ] word [= value]

This maps word to value, and generates an enumeration with entries of the form:

    label = value

If I<label> is undefined, the word will be used, the minus character will be
replaced by an underscore. If value is undefined it is counted upwards from
the last value.

There may also be one line of the format

    [ label ~] = value

Which defines the value to be used for non-existing keys. Note that this also
changes default value for other keys, as for normal entries. So if you place

    = 0

at the beginning of the file, unknown strings map to 0, and the other strings
map to values starting with 1. If label is not specified, the default is
I<Unknown>.

=head1 OPTIONS

=over 4

=item B<-C>I<.c file> B<--code>=I<.c file>

Generate code in the given file.

=item B<-H>I<header file> B<--header>=I<header file>

Generate a header in the given file, containing a declaration of the hash
function and an enumeration.

=item B<--enum-name=>I<word>

The name of the enumeration.

=item B<--function-name=>I<word>

The name of the function.

=item B<--namespace=>I<name>

Put the function and enum into a namespace (C++)

=item B<--class=>I<name>

Put the function and enum into a class (C++)

=item B<--enum-class>

Generate an enum class instead of an enum (C++)

=item B<--counter-name=>I<name>

Use I<name> for a counter that is set to the latest entry in the enumeration
+ 1. This can be useful for defining array sizes.

=item B<--extern-c>

Wrap everything into an extern "C" block. Not compatible with the C++
options, as a header with namespaces, classes, or enum classes is not
valid C.

=item B<--multi-byte>=I<value>

Generate code reading multiple bytes at once. The value is a string of power
of twos to enable. The default value is 320 meaning that 8, 4, and single byte
reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you
also want to allow 2-byte reads. 2-byte reads are disabled by default because
they negatively affect performance on older Intel architectures.

This generates code for both multiple bytes and single byte reads, but only
enables the multiple byte reads of GNU C compatible compilers, as the following
extensions are used:

=over 8

=item Byte-aligned integers

We must be able to generate integers that are aligned to a single byte using:

    typedef uint64_t __attribute__((aligned (1))) triehash_uu64;

=item Byte-order

The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.

=back

We forcefully disable multi-byte reads on platforms where the variable
I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
as there is a measurable overhead from emulating the unaligned reads on
ARM.

=item B<--language=>I<language>

Generate a file in the specified language. Currently known are 'C' and 'tree',
the latter generating a tree.

=item B<--include=>I<header>

Add the header to the include statements of the header file. The value must
be surrounded by quotes or angle brackets for C code. May be specified multiple
times.

=back

=cut

my $unknown = -1;
my $unknown_label = "Unknown";
my $counter_start = 0;
my $enum_name = "PerfectKey";
my $function_name = "PerfectHash";
my $enum_class = 0;

my $code_name = "-";
my $header_name = "-";
my $code;
my $header;
my $ignore_case = 0;
my $multi_byte = "320";
my $language = 'C';
my $counter_name = undef;
my @includes = ();


Getopt::Long::config('default',
                     'bundling',
                     'no_getopt_compat',
                     'no_auto_abbrev',
                     'permute',
                     'auto_help');

GetOptions ("code|C=s" => \$code_name,
            "header|H=s"   => \$header_name,
            "function-name=s" => \$function_name,
            "ignore-case" => \$ignore_case,
            "enum-name=s" => \$enum_name,
            "language|l=s" => \$language,
            "multi-byte=s" => \$multi_byte,
            "enum-class" => \$enum_class,
            "include=s" => \@includes,
            "counter-name=s" => \$counter_name)
    or die("Could not parse options!");


package Trie {

    sub new {
        my $class = shift;
        my $self = {};
        bless $self, $class;

        $self->{children} = {};
        $self->{value} = undef;
        $self->{label} = undef;

        return $self;
    }

    # Return the largest power of 2 smaller or equal to the argument
    sub alignpower2 {
        my ($self, $length) = @_;

        return 8 if ($length >= 8 && $multi_byte =~ /3/);
        return 4 if ($length >= 4 && $multi_byte =~ /2/);
        return 2 if ($length >= 2 && $multi_byte =~ /1/);

        return 1;
    }

    # Split the key into a head block and a tail
    sub split_key {
        my ($self, $key) = @_;
        my $length = length $key;
        my $split = $self->alignpower2($length);

        return (substr($key, 0, $split), substr($key, $split));
    }

    sub insert {
        my ($self, $key, $label, $value) = @_;

        if (length($key) == 0) {
            $self->{label} = $label;
            $self->{value} = $value;
            return;
        }

        my ($child, $tail) = $self->split_key($key);

        $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child}));

        $self->{children}{$child}->insert($tail, $label, $value);
    }

    sub filter_depth {
        my ($self, $togo) = @_;

        my $new = Trie->new;

        if ($togo != 0) {
            my $found = 0;
            foreach my $key (sort keys %{$self->{children}}) {
                if ($togo > length($key) || defined $self->{children}{$key}->{value}) {
                    my $child = $self->{children}{$key}->filter_depth($togo - length($key));

                    $new->{children}{$key}= $child if defined $child;
                    $found = 1 if defined $child;
                }
            }
            return undef if (!$found);
        } else {
            $new->{value} = $self->{value};
            $new->{label} = $self->{label};
        }

        return $new;
    }

    # Reinsert all value nodes into the specified $trie, prepending $prefix
    # to their $paths.
    sub reinsert_value_nodes_into {
        my ($self, $trie, $prefix) = @_;

        $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value});

        foreach my $key (sort keys %{$self->{children}}) {
            $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
        }
    }

    # Find an earlier split due a an ambiguous character
    sub find_ealier_split {
        my ($self, $key) = @_;

        if ($ignore_case) {
            for my $i (0..length($key)-1) {
                # If the key starts with an ambiguous character, we need to
                # take only it. Otherwise, we need to take everything
                # before the character.
                return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1)));
            }
        }
        return $self->alignpower2(length $key);
    }

    # Rebuild the trie, splitting at ambigous chars, and unifying key lengths
    sub rebuild_tree {
        my $self = shift;
        # Determine if/where we need to split before an ambiguous character
        my $new_split = 99999999999999999;
        foreach my $key (sort keys %{$self->{children}}) {
            my $special_length = $self->find_ealier_split($key);
            $new_split = $special_length if ($special_length < $new_split);
        }

        # Start building a new uniform trie
        my $newself = Trie->new;
        $newself->{label} = $self->{label};
        $newself->{value} = $self->{value};
        $newself->{children} = {};

        foreach my $key (sort keys %{$self->{children}}) {
            my $head = substr($key, 0, $new_split);
            my $tail = substr($key, $new_split);
            # Rebuild the child node at $head, pushing $tail downwards
            $newself->{children}{$head} //= Trie->new;
            $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail);
            # We took up to one special character of each key label. There might
            # be more, so we need to rebuild recursively.
            $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree();
        }

        return $newself;
    }
}

# Code generator for C and C++
package CCodeGen {
    my $static = ($code_name eq $header_name) ? "static" : "";
    my $enum_specifier = $enum_class ? "enum class" : "enum";

    sub new {
        my $class = shift;
        my $self = {};
        bless $self, $class;

        return $self;
    }

    sub open_output {
        my $self = shift;
        if ($code_name ne "-") {
            open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
        } else {
            $code = *STDOUT;
        }
        if($code_name eq $header_name) {
            $header = $code;
        } elsif ($header_name ne "-") {
            open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
        } else {
            $header = *STDOUT;
        }
    }

    sub word_to_label {
        my ($class, $word) = @_;

        $word =~ s/_/__/g;
        $word =~ s/-/_/g;
        return $word;
    }

    # Return a case label, by shifting and or-ing bytes in the word
    sub case_label {
        my ($self, $key) = @_;

        return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;

        my $output = '0';

        for my $i (0..length($key)-1) {
            $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
        }

        return $output;
    }

    # Return an appropriate read instruction for $length bytes from $offset
    sub switch_key {
        my ($self, $offset, $length) = @_;

        return "string[$offset]" if $length == 1;
        return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
    }

    sub print_table {
        my ($self, $trie, $fh, $indent, $index) = @_;
        $indent //= 0;
        $index //= 0;

        if (defined $trie->{value}) {
            printf $fh ("    " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
            return;
        }

        # The difference between lowercase and uppercase alphabetical characters
        # is that they have one bit flipped. If we have alphabetical characters
        # in the search space, and the entire search space works fine if we
        # always turn on the flip, just OR the character we are switching over
        # with the bit.
        my $want_use_bit = 0;
        my $can_use_bit = 1;
        my $key_length = 0;
        foreach my $key (sort keys %{$trie->{children}}) {
            $can_use_bit &= not main::ambiguous($key);
            $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
            $key_length = length($key);
        }

        if ($ignore_case && $can_use_bit && $want_use_bit) {
            printf $fh (("    " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
        } else {
            printf $fh (("    " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
        }

        my $notfirst = 0;
        foreach my $key (sort keys %{$trie->{children}}) {
            if ($notfirst) {
                printf $fh ("    " x $indent . "    break;\n");
            }
            if ($ignore_case) {
                printf $fh ("    " x $indent . "case %s:\n", $self->case_label(lc($key)));
                printf $fh ("    " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
            } else {
                printf $fh ("    " x $indent . "case %s:\n", $self->case_label($key));
            }

            $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));

            $notfirst=1;
        }

        printf $fh ("    " x $indent . "}\n");
    }

    sub print_words {
        my ($self, $trie, $fh, $indent, $sofar) = @_;

        $indent //= 0;
        $sofar //= "";


        printf $fh ("    " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value};

        foreach my $key (sort keys %{$trie->{children}}) {
            $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key);
        }
    }

    sub print_functions {
        my ($self, $trie, %lengths) = @_;
        foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
            print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
            print $code ("{\n");
            $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
            printf $code ("    return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
            print $code ("}\n");
        }
    }

    sub main {
        my ($self, $trie, $num_values, %lengths) = @_;
        print $header ("#ifndef TRIE_HASH_${function_name}\n");
        print $header ("#define TRIE_HASH_${function_name}\n");
        print $header ("#include <stddef.h>\n");
        print $header ("#include <stdint.h>\n");
        foreach my $include (@includes) {
            print $header ("#include $include\n");
        }
        printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
        print $header ("${enum_specifier} ${enum_name} {\n");
        $self->print_words($trie, $header, 1);
        printf $header ("    $unknown_label = $unknown,\n");
        print $header ("};\n");
        print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");

        print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);

        if ($multi_byte) {
            print $code ("#ifdef __GNUC__\n");
            for (my $i=16; $i <= 64; $i *= 2) {
                print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
                print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
            }

            print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
            print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
            print $code ("#else\n");
            print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
            print $code ("#endif\n");
            print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
            print $code ("#define TRIE_HASH_MULTI_BYTE\n");
            print $code ("#endif\n");
            print $code ("#endif /*GNUC */\n");

            print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
            $self->print_functions($trie, %lengths);
            $multi_byte = 0;
            print $code ("#else\n");
            $self->print_functions($trie, %lengths);
            print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
        } else {
            $self->print_functions($trie, %lengths);
        }

        print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
        print $code ("{\n");
        print $code ("    switch (length) {\n");
        foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
            print $code ("    case $local_length:\n");
            print $code ("        return ${function_name}${local_length}(string);\n");
        }
        print $code ("    default:\n");
        printf $code ("        return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
        print $code ("    }\n");
        print $code ("}\n");

        # Print end of header here, in case header and code point to the same file
        print $header ("#endif                       /* TRIE_HASH_${function_name} */\n");
    }
}

# Check if the word can be reached by exactly one word in (alphabet OR 0x20).
sub ambiguous {
    my $word = shift;

    foreach my $char (split //, $word) {
        # Setting the lowercase flag in the character produces a different
        # character, the character would thus not be matched.
        return 1 if ((ord($char) | 0x20) != ord(lc($char)));

        # A word is also ambiguous if any character in lowercase can be reached
        # by ORing 0x20 from another character in the charset that is not a
        # lowercase character of the current character.
        # Assume that we have UTF-8 and the most significant bit can be set
        for my $i (0..255) {
            return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char));
        }
    }

    return 0;
}

sub build_trie {
    my $codegen = shift;
    my $trie = Trie->new;

    my $counter = $counter_start;
    my %lengths;

    open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
    while (my $line = <$input>) {
        my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/;

        if (defined $word) {
            $counter = $value if defined($value);
            $label //= $codegen->word_to_label($word);

            $trie->insert($word, $label, $counter);
            $lengths{length($word)} = 1;
            $counter++;
        } elsif (defined $value) {
            $unknown = $value;
            $unknown_label = $label if defined($label);
            $counter = $value + 1;
        } else {
            die "Invalid line: $line";
        }
    }

    return ($trie, $counter, %lengths);
}

# Generates an ASCII art tree
package TreeCodeGen {

    sub new {
        my $class = shift;
        my $self = {};
        bless $self, $class;

        return $self;
    }

    sub word_to_label {
        my ($self, $word) = @_;
        return $word;
    }

    sub main {
        my ($self, $trie, $counter, %lengths) = @_;
        printf $code ("┌────────────────────────────────────────────────────┐\n");
        printf $code ("│                   Initial trie                     │\n");
        printf $code ("└────────────────────────────────────────────────────┘\n");
        $self->print($trie);
        printf $code ("┌────────────────────────────────────────────────────┐\n");
        printf $code ("│                   Rebuilt trie                     │\n");
        printf $code ("└────────────────────────────────────────────────────┘\n");
        $self->print($trie->rebuild_tree());

        foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
            printf $code ("┌────────────────────────────────────────────────────┐\n");
            printf $code ("│              Trie for words of length %-4d         │\n", $local_length);
            printf $code ("└────────────────────────────────────────────────────┘\n");
            $self->print($trie->filter_depth($local_length)->rebuild_tree());
        }
    }

    sub open_output {
        my $self = shift;
        if ($code_name ne "-") {
            open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
        } else {
            $code = *STDOUT;
        }
    }

    # Print a trie
    sub print {
        my ($self, $trie, $depth) = @_;
        $depth //= 0;

        print $code (" → ") if defined($trie->{label});
        print $code ($trie->{label} // "", "\n");
        foreach my $key (sort keys %{$trie->{children}}) {
            print $code ("│   " x ($depth), "├── $key");
            $self->print($trie->{children}{$key}, $depth + 1);
        }
    }
}

my %codegens = (
    C => "CCodeGen",
    tree => "TreeCodeGen",
);


defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
my $codegen = $codegens{$language}->new();
my ($trie, $counter, %lengths) = build_trie($codegen);

$codegen->open_output();
$codegen->main($trie, $counter, %lengths);


=head1 LICENSE

triehash is available under the MIT/Expat license, see the source code
for more information.

=head1 AUTHOR

Julian Andres Klode <jak@jak-linux.org>

=cut