Merge commit 'e2073b0276226b625897ef475f225bf8f508719e' as 'triehash'

author Julian Andres Klode <jak@debian.org>

Tue, 22 Nov 2016 21:57:46 +0000 (22:57 +0100)

committer Julian Andres Klode <jak@debian.org>

Tue, 22 Nov 2016 21:57:46 +0000 (22:57 +0100)
author Julian Andres Klode <jak@debian.org>
Tue, 22 Nov 2016 21:57:46 +0000 (22:57 +0100)
committer Julian Andres Klode <jak@debian.org>
Tue, 22 Nov 2016 21:57:46 +0000 (22:57 +0100)
diff --cc triehash/.travis.yml

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..c66ea96973873eff69f45a95eaa4c1d7e2864078

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/triehash/.travis.yml
@@@ -1,0 -1,0 +1,14 @@@
++language: perl
++perl:
++  - "5.24"
++  - "5.22"
++  - "5.20"
++  - "5.18"
++  - "5.16"
++  - "5.14"
++install:
++  - cpanm --quiet --notest --skip-satisfied Devel::Cover Devel::Cover::Report::Codecov
++script:
++  - ./tests/run-tests.sh
++after_script:
++  - cover -report codecov
diff --cc triehash/LICENSE.md

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..89de354795ec7a7cdab07c091029653d3618540d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/triehash/LICENSE.md
@@@ -1,0 -1,0 +1,17 @@@
++Permission is hereby granted, free of charge, to any person obtaining a copy
++of this software and associated documentation files (the "Software"), to deal
++in the Software without restriction, including without limitation the rights
++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++copies of the Software, and to permit persons to whom the Software is
++furnished to do so, subject to the following conditions:
++
++The above copyright notice and this permission notice shall be included in
++all copies or substantial portions of the Software.
++
++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++THE SOFTWARE.
diff --cc triehash/README.md

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..d6f6e7fc3c52fa8e4d561602dadfcd0c4511ccde

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/triehash/README.md
@@@ -1,0 -1,0 +1,58 @@@
++# Order-preserving minimal perfect hash function generator
++
++Build order-preserving minimal perfect hash functions.
++
++[![codecov](https://codecov.io/gh/julian-klode/triehash/branch/master/graph/badge.svg)](https://codecov.io/gh/julian-klode/triehash)
++[![Build Status](https://travis-ci.org/julian-klode/triehash.svg?branch=master)](https://travis-ci.org/julian-klode/triehash)
++
++## Performance
++
++Performance was evaluated against other hash functions. As an input set, the
++fields of Debian Packages and Sources files was used, and each hash function
++was run 1,000,000 times for each word. The byte count of the words were then
++summed up and divided by the total number of nanoseconds each function ran, so
++all speeds below are given in bytes per nanosecond, AKA gigabyte per second.
++
++arch/function|jak-x230 (amd64)|backup (amd64)|asachi.d.o (arm64)|asachi.d.o (armel)|asachi.d.o (armhf)|plummer.d.o (ppc64el)|eller.d.o (mipsel)
++-------------|----------------|--------------|------------------|------------------|------------------|---------------------|------------------
++Trie         |      2.4       |      1.9     |      1.2         |      0.9         |      0.8         |      2.0            |      0.2
++Trie (*)     |      2.2       |      1.7     |      0.8         |      0.7         |      0.7         |      1.8            |      0.2
++re2c         |      1.7       |      1.3     |      0.9         |      0.9         |      0.7         |      1.6            |      0.2
++re2c (*)     |      1.2       |      0.9     |      0.6         |      0.6         |      0.5         |      1.1            |      0.1
++gperf (*)    |      0.7       |      0.5     |      0.2         |      0.2         |      0.2         |      0.5            |      0.1
++gperf        |      1.3       |      0.9     |      0.3         |      0.3         |      0.2         |      0.4            |      0.1
++djb (*)      |      0.7       |      0.5     |      0.3         |      0.3         |      0.3         |      0.5            |      0.1
++djb (**)     |      1.0       |      0.7     |      0.4         |      0.5         |      0.5         |      0.6            |      0.2
++djb          |      0.9       |      0.7     |      0.5         |      0.5         |      0.5         |      0.7            |      0.2
++apt (*)      |      1.2       |      0.9     |      0.7         |      0.7         |      0.7         |      1.1            |      0.2
++apt (**)     |      2.3       |      1.7     |      0.7         |      0.9         |      0.8         |      1.9            |      0.2
++
++And transposed:
++
++function/arch        |Trie     |Trie (*) |re2c     |re2c (*) |gperf (*)|gperf    |djb (*)  |djb (**) |djb      |apt (*)  |apt (**)
++---------------------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------
++jak-x230 (amd64)     |      2.4|      2.2|      1.7|      1.2|      0.7|      1.3|      0.7|      1.0|      0.9|      1.2|      2.3
++backup (amd64)       |      1.9|      1.7|      1.3|      0.9|      0.5|      0.9|      0.5|      0.7|      0.7|      0.9|      1.7
++asachi.d.o (arm64)   |      1.2|      0.8|      0.9|      0.6|      0.2|      0.3|      0.3|      0.4|      0.5|      0.7|      0.7
++asachi.d.o (armel)   |      0.9|      0.7|      0.9|      0.6|      0.2|      0.3|      0.3|      0.5|      0.5|      0.7|      0.9
++asachi.d.o (armhf)   |      0.8|      0.7|      0.7|      0.5|      0.2|      0.2|      0.3|      0.5|      0.5|      0.7|      0.8
++plummer.d.o (ppc64el)|      2.0|      1.8|      1.6|      1.1|      0.5|      0.4|      0.5|      0.6|      0.7|      1.1|      1.9
++eller.d.o (mipsel)   |      0.2|      0.2|      0.2|      0.1|      0.1|      0.1|      0.1|      0.2|      0.2|      0.2|      0.2
++
++
++Legend:
++
++* The (*) variants are case-insensitive, (**) are more optimised versions
++  of the (*) versions.
++* DJB (*) is a DJB Hash with naive lowercase conversion, DJB (**) just ORs one
++  bit into each value to get alphabetical characters to be lowercase
++* APT (*) is the AlphaHash function from APT which hashes the last 8 bytes in a
++  word in a case-insensitive manner. APT (**) is the same function unrolled.
++* All hosts except the x230 are Debian porterboxes. The x230 has a Core i5-3320M,
++  barriere has an Opteron 23xx.
++
++Notes:
++
++* The overhead is larger than needed on some platforms due to gcc inserting
++  unneeded zero extend instructions, see:
++  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77729
diff --cc triehash/tests/framework.sh

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..51d4580a6ba4e4ac548db539362503d28374af7c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/triehash/tests/framework.sh
@@@ -1,0 -1,0 +1,84 @@@
++#!/bin/sh
++# Simple integration test framework
++
++set -e
++
++
++cleanup() {
++    rm -f test.output test.c test.h test.tree
++}
++
++dumpone() {
++    if [ -e "$@" ]; then
++        echo "Content of $@:"
++        cat "$@" | sed "s#^#\t#g"
++    fi
++
++}
++
++dump() {
++    dumpone test.output
++    dumpone test.c
++    dumpone test.h
++    dumpone test.tree
++    return 1
++}
++
++testsuccess() {
++    [ "$INNER" ] || cleanup
++    [ "$INNER" ] || echo "Testing success of $@"
++    if ! "$@" > test.output 2>&1; then
++        echo "ERROR: Running $@ failed with error $?, messages were:" >&2
++        dump
++        return 1
++    fi
++}
++
++testfailure() {
++    [ "$INNER" ] || cleanup
++    [ "$INNER" ] || echo "Testing failure of $@"
++    if "$@" > test.output 2>&1; then
++        echo "ERROR: Running $@ unexpectedly succeeded, messages were:" >&2
++        dump
++        return 1
++    fi
++}
++
++testfileequal() {
++    [ "$INNER" ] ||  echo "Testing output of $2"
++    printf "%b\n" "$1" > expected
++    if ! diff -u "expected" "$2" > test.diff; then
++        echo "ERROR: Differences between expected output and and $2:" >&2
++        cat test.diff | sed "s#^#\t#g"
++        dump
++        return 1
++    fi
++}
++
++testgrep() {
++    [ "$INNER" ] ||  echo "Testing grep $@"
++    INNER=1 testsuccess grep "$@"
++    unset INNER
++}
++
++testsuccessequal() {
++    expect="$1"
++    shift
++    cleanup
++    echo "Testing success and output of $@"
++    INNER=1 testsuccess "$@"
++    INNER=1 testfileequal "$expect" test.output
++    unset INNER
++}
++
++
++WORDS="Word-_0
++Word = 42
++VeryLongWord
++Label ~ Word2
++= -9"
++
++triehash() {
++    printf "%b\n" "$WORDS" | perl -MDevel::Cover=-summary,0,-silent,1 $(dirname $(dirname $(readlink -f $0)))/triehash.pl "$@" || return $?
++    return $?
++}
diff --cc triehash/tests/run-tests.sh

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..b9c1ec30945ee69524e43c87c0c20370e6eb35ee

new file mode 100755 (executable)
--- /dev/null
--- /dev/null
+++ b/triehash/tests/run-tests.sh
@@@ -1,0 -1,0 +1,22 @@@
++#!/bin/sh
++DIR=$(dirname $(readlink -f $0))
++
++# Let's go into triehash.pl's dir
++cd $(dirname "$DIR")
++
++rm -rf cover_db
++
++count=$(cd "$DIR" && echo test-* | wc -w)
++i=1
++
++for test in $DIR/test-*; do
++    echo "[$i/$count] Running testcase $test"
++    if ! $test > test.summary 2>&1; then
++        cat test.summary
++        exit 1
++    fi
++    i=$((i + 1))
++done
++
++
++cover
diff --cc triehash/tests/test-basic

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..19cb086842f56adfe41c8c904ea373a249888f0c

new file mode 100755 (executable)
--- /dev/null
--- /dev/null
+++ b/triehash/tests/test-basic
@@@ -1,0 -1,0 +1,245 @@@
++#!/bin/sh
++. $(dirname $(readlink -f $0))/framework.sh
++
++# Check for non-existing files
++testfailure triehash -C /does/not/exist1 -H /does/not/exist1 /does/not/exist/input
++
++# Check that we can specify - for -C and -H
++testsuccessequal "#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum PerfectKey {
++    Unknown = -1,
++};
++static enum PerfectKey PerfectHash(const char *string, size_t length);
++static enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    default:
++        return Unknown;
++    }
++}
++#endif                       /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 -C - -H -
++
++# Check that split files work
++testsuccess triehash -C test.c -H test.h --multi-byte=0
++testfileequal "#include \"test.h\"
++ enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    default:
++        return Unknown;
++    }
++}" test.c
++testfileequal "#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum PerfectKey {
++    Unknown = -1,
++};
++ enum PerfectKey PerfectHash(const char *string, size_t length);
++#endif                       /* TRIE_HASH_PerfectHash */" test.h
++
++
++# Check the C code generator
++testsuccess triehash -C test.c -H test.c /dev/stdin
++testfileequal "#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum PerfectKey {
++    VeryLongWord = 43,
++    Word = 42,
++    Word___0 = 0,
++    Label = 44,
++    Unknown = -9,
++};
++static enum PerfectKey PerfectHash(const char *string, size_t length);
++#ifdef __GNUC__
++typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
++typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
++typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
++typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
++typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
++typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
++#define onechar(c, s, l) (((uint64_t)(c)) << (s))
++#else
++#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
++#endif
++#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
++#define TRIE_HASH_MULTI_BYTE
++#endif
++#endif /*GNUC */
++#ifdef TRIE_HASH_MULTI_BYTE
++static enum PerfectKey PerfectHash4(const char *string)
++{
++    switch(*((triehash_uu32*) &string[0])) {
++    case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++        return Word;
++    }
++    return Unknown;
++}
++static enum PerfectKey PerfectHash5(const char *string)
++{
++    switch(*((triehash_uu32*) &string[0])) {
++    case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++        switch(string[4]) {
++        case 0| onechar('2', 0, 8):
++            return Label;
++        }
++    }
++    return Unknown;
++}
++static enum PerfectKey PerfectHash7(const char *string)
++{
++    switch(*((triehash_uu32*) &string[0])) {
++    case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++        switch(string[4]) {
++        case 0| onechar('-', 0, 8):
++            switch(string[5]) {
++            case 0| onechar('_', 0, 8):
++                switch(string[6]) {
++                case 0| onechar('0', 0, 8):
++                    return Word___0;
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++static enum PerfectKey PerfectHash12(const char *string)
++{
++    switch(*((triehash_uu64*) &string[0])) {
++    case 0| onechar('V', 0, 64)| onechar('e', 8, 64)| onechar('r', 16, 64)| onechar('y', 24, 64)| onechar('L', 32, 64)| onechar('o', 40, 64)| onechar('n', 48, 64)| onechar('g', 56, 64):
++        switch(*((triehash_uu32*) &string[8])) {
++        case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++            return VeryLongWord;
++        }
++    }
++    return Unknown;
++}
++#else
++static enum PerfectKey PerfectHash4(const char *string)
++{
++    switch(string[0]) {
++    case 'W':
++        switch(string[1]) {
++        case 'o':
++            switch(string[2]) {
++            case 'r':
++                switch(string[3]) {
++                case 'd':
++                    return Word;
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++static enum PerfectKey PerfectHash5(const char *string)
++{
++    switch(string[0]) {
++    case 'W':
++        switch(string[1]) {
++        case 'o':
++            switch(string[2]) {
++            case 'r':
++                switch(string[3]) {
++                case 'd':
++                    switch(string[4]) {
++                    case '2':
++                        return Label;
++                    }
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++static enum PerfectKey PerfectHash7(const char *string)
++{
++    switch(string[0]) {
++    case 'W':
++        switch(string[1]) {
++        case 'o':
++            switch(string[2]) {
++            case 'r':
++                switch(string[3]) {
++                case 'd':
++                    switch(string[4]) {
++                    case '-':
++                        switch(string[5]) {
++                        case '_':
++                            switch(string[6]) {
++                            case '0':
++                                return Word___0;
++                            }
++                        }
++                    }
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++static enum PerfectKey PerfectHash12(const char *string)
++{
++    switch(string[0]) {
++    case 'V':
++        switch(string[1]) {
++        case 'e':
++            switch(string[2]) {
++            case 'r':
++                switch(string[3]) {
++                case 'y':
++                    switch(string[4]) {
++                    case 'L':
++                        switch(string[5]) {
++                        case 'o':
++                            switch(string[6]) {
++                            case 'n':
++                                switch(string[7]) {
++                                case 'g':
++                                    switch(string[8]) {
++                                    case 'W':
++                                        switch(string[9]) {
++                                        case 'o':
++                                            switch(string[10]) {
++                                            case 'r':
++                                                switch(string[11]) {
++                                                case 'd':
++                                                    return VeryLongWord;
++                                                }
++                                            }
++                                        }
++                                    }
++                                }
++                            }
++                        }
++                    }
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++#endif /* TRIE_HASH_MULTI_BYTE */
++static enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    case 4:
++        return PerfectHash4(string);
++    case 5:
++        return PerfectHash5(string);
++    case 7:
++        return PerfectHash7(string);
++    case 12:
++        return PerfectHash12(string);
++    default:
++        return Unknown;
++    }
++}
++#endif                       /* TRIE_HASH_PerfectHash */"  test.c
diff --cc triehash/tests/test-case-insensitive

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..25ab2dd7810f5b231eb3d2a7e3971b950d4f5ccf

new file mode 100755 (executable)
--- /dev/null
--- /dev/null
+++ b/triehash/tests/test-case-insensitive
@@@ -1,0 -1,0 +1,109 @@@
++#!/bin/sh
++. $(dirname $(readlink -f $0))/framework.sh
++
++WORDS="Halllo\nH-lllo\nHalll1"
++
++# Case-insensitive test
++testsuccessequal "#include \"/dev/null\"
++#ifdef __GNUC__
++typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
++typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
++typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
++typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
++typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
++typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
++#define onechar(c, s, l) (((uint64_t)(c)) << (s))
++#else
++#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
++#endif
++#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
++#define TRIE_HASH_MULTI_BYTE
++#endif
++#endif /*GNUC */
++#ifdef TRIE_HASH_MULTI_BYTE
++static enum PerfectKey PerfectHash6(const char *string)
++{
++    switch(string[0] | 0x20) {
++    case 0| onechar('h', 0, 8):
++        switch(string[1]) {
++        case 0| onechar('-', 0, 8):
++            switch(*((triehash_uu32*) &string[2]) | 0x20202020) {
++            case 0| onechar('l', 0, 32)| onechar('l', 8, 32)| onechar('l', 16, 32)| onechar('o', 24, 32):
++                return H_lllo;
++            }
++            break;
++        case 0| onechar('a', 0, 8):
++        case 0| onechar('A', 0, 8):
++            switch(*((triehash_uu16*) &string[2]) | 0x2020) {
++            case 0| onechar('l', 0, 16)| onechar('l', 8, 16):
++                switch(string[4] | 0x20) {
++                case 0| onechar('l', 0, 8):
++                    switch(string[5]) {
++                    case 0| onechar('1', 0, 8):
++                        return Halll1;
++                        break;
++                    case 0| onechar('o', 0, 8):
++                    case 0| onechar('O', 0, 8):
++                        return Halllo;
++                    }
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++#else
++static enum PerfectKey PerfectHash6(const char *string)
++{
++    switch(string[0] | 0x20) {
++    case 'h':
++        switch(string[1]) {
++        case '-':
++            switch(string[2] | 0x20) {
++            case 'l':
++                switch(string[3] | 0x20) {
++                case 'l':
++                    switch(string[4] | 0x20) {
++                    case 'l':
++                        switch(string[5] | 0x20) {
++                        case 'o':
++                            return H_lllo;
++                        }
++                    }
++                }
++            }
++            break;
++        case 'a':
++        case 'A':
++            switch(string[2] | 0x20) {
++            case 'l':
++                switch(string[3] | 0x20) {
++                case 'l':
++                    switch(string[4] | 0x20) {
++                    case 'l':
++                        switch(string[5]) {
++                        case '1':
++                            return Halll1;
++                            break;
++                        case 'o':
++                        case 'O':
++                            return Halllo;
++                        }
++                    }
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++#endif /* TRIE_HASH_MULTI_BYTE */
++ enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    case 6:
++        return PerfectHash6(string);
++    default:
++        return Unknown;
++    }
++}" triehash --multi-byte=3210 --ignore-case -H /dev/null /dev/stdin
diff --cc triehash/tests/test-enum-include-name-flags

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..33bd97c0fd54170a26658a8103c2c27c2637510a

new file mode 100755 (executable)
--- /dev/null
--- /dev/null
+++ b/triehash/tests/test-enum-include-name-flags
@@@ -1,0 -1,0 +1,129 @@@
++#!/bin/sh
++. $(dirname $(readlink -f $0))/framework.sh
++
++# Need a short word, we really just need to check if the labels work
++WORDS=w
++
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++#include <foo.h>
++enum PerfectKey {
++    w = 0,
++    Unknown = -1,
++};
++ enum PerfectKey PerfectHash(const char *string, size_t length);
++#endif                       /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 -C /dev/null --include="<foo.h>" /dev/stdin
++
++# Check for --enum-class support
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum class PerfectKey {
++    w = 0,
++    Unknown = -1,
++};
++static enum PerfectKey PerfectHash(const char *string, size_t length);
++static enum PerfectKey PerfectHash1(const char *string)
++{
++    switch(string[0]) {
++    case 'w':
++        return PerfectKey::w;
++    }
++    return PerfectKey::Unknown;
++}
++static enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    case 1:
++        return PerfectHash1(string);
++    default:
++        return PerfectKey::Unknown;
++    }
++}
++#endif                       /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 --enum-class /dev/stdin
++
++# Check for --enum-name support
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum Foo {
++    Unknown = -1,
++};
++static enum Foo PerfectHash(const char *string, size_t length);
++static enum Foo PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    default:
++        return Unknown;
++    }
++}
++#endif                       /* TRIE_HASH_PerfectHash */\
++" triehash --multi-byte=0 --enum-name="Foo"
++
++# Check for --enum-class support
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum class Foo::Bar {
++    Unknown = -1,
++};
++static enum Foo::Bar PerfectHash(const char *string, size_t length);
++static enum Foo::Bar PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    default:
++        return Foo::Bar::Unknown;
++    }
++}
++#endif                       /* TRIE_HASH_PerfectHash */\
++" triehash --multi-byte=0 --enum-class --enum-name="Foo::Bar"
++
++# Check for --function-name support
++testsuccessequal "\
++#ifndef TRIE_HASH_NonSense
++#define TRIE_HASH_NonSense
++#include <stddef.h>
++#include <stdint.h>
++enum PerfectKey {
++    Unknown = -1,
++};
++static enum PerfectKey NonSense(const char *string, size_t length);
++static enum PerfectKey NonSense(const char *string, size_t length)
++{
++    switch (length) {
++    default:
++        return Unknown;
++    }
++}
++#endif                       /* TRIE_HASH_NonSense */\
++" triehash --multi-byte=0 --function-name="NonSense"
++
++# Check for --counter-name support
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum { MyCounter = 0 };
++enum PerfectKey {
++    Unknown = -1,
++};
++static enum PerfectKey PerfectHash(const char *string, size_t length);
++static enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    default:
++        return Unknown;
++    }
++}
++#endif                       /* TRIE_HASH_PerfectHash */\
++" triehash --multi-byte=0 --counter-name="MyCounter"
diff --cc triehash/tests/test-multi-byte-level

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..ddfb8cd1b34622ac964240dd832ba6fbb68f823e

new file mode 100755 (executable)
--- /dev/null
--- /dev/null
+++ b/triehash/tests/test-multi-byte-level
@@@ -1,0 -1,0 +1,427 @@@
++#!/bin/sh
++. $(dirname $(readlink -f $0))/framework.sh
++
++# Check that building a single-byte trie works
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│                   Initial trie                     │
++└────────────────────────────────────────────────────┘
++
++├── V
++│   ├── e
++│   │   ├── r
++│   │   │   ├── y
++│   │   │   │   ├── L
++│   │   │   │   │   ├── o
++│   │   │   │   │   │   ├── n
++│   │   │   │   │   │   │   ├── g
++│   │   │   │   │   │   │   │   ├── W
++│   │   │   │   │   │   │   │   │   ├── o
++│   │   │   │   │   │   │   │   │   │   ├── r
++│   │   │   │   │   │   │   │   │   │   │   ├── d → VeryLongWord
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d → Word
++│   │   │   │   ├── -
++│   │   │   │   │   ├── _
++│   │   │   │   │   │   ├── 0 → Word-_0
++│   │   │   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│                   Rebuilt trie                     │
++└────────────────────────────────────────────────────┘
++
++├── V
++│   ├── e
++│   │   ├── r
++│   │   │   ├── y
++│   │   │   │   ├── L
++│   │   │   │   │   ├── o
++│   │   │   │   │   │   ├── n
++│   │   │   │   │   │   │   ├── g
++│   │   │   │   │   │   │   │   ├── W
++│   │   │   │   │   │   │   │   │   ├── o
++│   │   │   │   │   │   │   │   │   │   ├── r
++│   │   │   │   │   │   │   │   │   │   │   ├── d → VeryLongWord
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d → Word
++│   │   │   │   ├── -
++│   │   │   │   │   ├── _
++│   │   │   │   │   │   ├── 0 → Word-_0
++│   │   │   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 4            │
++└────────────────────────────────────────────────────┘
++
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d → Word
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 5            │
++└────────────────────────────────────────────────────┘
++
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d
++│   │   │   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 7            │
++└────────────────────────────────────────────────────┘
++
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d
++│   │   │   │   ├── -
++│   │   │   │   │   ├── _
++│   │   │   │   │   │   ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 12           │
++└────────────────────────────────────────────────────┘
++
++├── V
++│   ├── e
++│   │   ├── r
++│   │   │   ├── y
++│   │   │   │   ├── L
++│   │   │   │   │   ├── o
++│   │   │   │   │   │   ├── n
++│   │   │   │   │   │   │   ├── g
++│   │   │   │   │   │   │   │   ├── W
++│   │   │   │   │   │   │   │   │   ├── o
++│   │   │   │   │   │   │   │   │   │   ├── r
++│   │   │   │   │   │   │   │   │   │   │   ├── d → VeryLongWord" triehash --multi-byte=0 -l tree /dev/stdin
++
++# Two byte optimization
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│                   Initial trie                     │
++└────────────────────────────────────────────────────┘
++
++├── Ve
++│   ├── ry
++│   │   ├── Lo
++│   │   │   ├── ng
++│   │   │   │   ├── Wo
++│   │   │   │   │   ├── rd → VeryLongWord
++├── Wo
++│   ├── rd → Word
++│   │   ├── -_
++│   │   │   ├── 0 → Word-_0
++│   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│                   Rebuilt trie                     │
++└────────────────────────────────────────────────────┘
++
++├── Ve
++│   ├── ry
++│   │   ├── Lo
++│   │   │   ├── ng
++│   │   │   │   ├── Wo
++│   │   │   │   │   ├── rd → VeryLongWord
++├── Wo
++│   ├── rd → Word
++│   │   ├── -
++│   │   │   ├── _0 → Word-_0
++│   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 4            │
++└────────────────────────────────────────────────────┘
++
++├── Wo
++│   ├── rd → Word
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 5            │
++└────────────────────────────────────────────────────┘
++
++├── Wo
++│   ├── rd
++│   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 7            │
++└────────────────────────────────────────────────────┘
++
++├── Wo
++│   ├── rd
++│   │   ├── -_
++│   │   │   ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 12           │
++└────────────────────────────────────────────────────┘
++
++├── Ve
++│   ├── ry
++│   │   ├── Lo
++│   │   │   ├── ng
++│   │   │   │   ├── Wo
++│   │   │   │   │   ├── rd → VeryLongWord" triehash --multi-byte=1 -l tree /dev/stdin
++# Four byte optimization
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│                   Initial trie                     │
++└────────────────────────────────────────────────────┘
++
++├── Very
++│   ├── Long
++│   │   ├── Word → VeryLongWord
++├── Word → Word
++│   ├── -
++│   │   ├── _
++│   │   │   ├── 0 → Word-_0
++│   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│                   Rebuilt trie                     │
++└────────────────────────────────────────────────────┘
++
++├── Very
++│   ├── Long
++│   │   ├── Word → VeryLongWord
++├── Word → Word
++│   ├── -
++│   │   ├── _
++│   │   │   ├── 0 → Word-_0
++│   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 4            │
++└────────────────────────────────────────────────────┘
++
++├── Word → Word
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 5            │
++└────────────────────────────────────────────────────┘
++
++├── Word
++│   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 7            │
++└────────────────────────────────────────────────────┘
++
++├── Word
++│   ├── -
++│   │   ├── _
++│   │   │   ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 12           │
++└────────────────────────────────────────────────────┘
++
++├── Very
++│   ├── Long
++│   │   ├── Word → VeryLongWord" triehash --multi-byte=2 -l tree /dev/stdin
++# Eigh byte optimization
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│                   Initial trie                     │
++└────────────────────────────────────────────────────┘
++
++├── VeryLong
++│   ├── W
++│   │   ├── o
++│   │   │   ├── r
++│   │   │   │   ├── d → VeryLongWord
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d → Word
++│   │   │   │   ├── -
++│   │   │   │   │   ├── _
++│   │   │   │   │   │   ├── 0 → Word-_0
++│   │   │   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│                   Rebuilt trie                     │
++└────────────────────────────────────────────────────┘
++
++├── V
++│   ├── eryLongW
++│   │   ├── o
++│   │   │   ├── r
++│   │   │   │   ├── d → VeryLongWord
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d → Word
++│   │   │   │   ├── -
++│   │   │   │   │   ├── _
++│   │   │   │   │   │   ├── 0 → Word-_0
++│   │   │   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 4            │
++└────────────────────────────────────────────────────┘
++
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d → Word
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 5            │
++└────────────────────────────────────────────────────┘
++
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d
++│   │   │   │   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 7            │
++└────────────────────────────────────────────────────┘
++
++├── W
++│   ├── o
++│   │   ├── r
++│   │   │   ├── d
++│   │   │   │   ├── -
++│   │   │   │   │   ├── _
++│   │   │   │   │   │   ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 12           │
++└────────────────────────────────────────────────────┘
++
++├── VeryLong
++│   ├── W
++│   │   ├── o
++│   │   │   ├── r
++│   │   │   │   ├── d → VeryLongWord" triehash --multi-byte=3 -l tree /dev/stdin
++
++
++# Check that building a multi-byte trie works
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│                   Initial trie                     │
++└────────────────────────────────────────────────────┘
++
++├── VeryLong
++│   ├── Word → VeryLongWord
++├── Word → Word
++│   ├── -
++│   │   ├── _
++│   │   │   ├── 0 → Word-_0
++│   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│                   Rebuilt trie                     │
++└────────────────────────────────────────────────────┘
++
++├── Very
++│   ├── LongWord → VeryLongWord
++├── Word → Word
++│   ├── -
++│   │   ├── _
++│   │   │   ├── 0 → Word-_0
++│   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 4            │
++└────────────────────────────────────────────────────┘
++
++├── Word → Word
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 5            │
++└────────────────────────────────────────────────────┘
++
++├── Word
++│   ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 7            │
++└────────────────────────────────────────────────────┘
++
++├── Word
++│   ├── -
++│   │   ├── _
++│   │   │   ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│              Trie for words of length 12           │
++└────────────────────────────────────────────────────┘
++
++├── VeryLong
++│   ├── Word → VeryLongWord" triehash -l tree /dev/stdin
++
++
++###### CHANGE THE WORDS FOR THE FOLLOWING TESTS #######
++WORDS="Word"
++
++# Check that we are generating the proper multi-byte and fallback sessions
++testsuccessequal "#include \"/dev/null\"
++#ifdef __GNUC__
++typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
++typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
++typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
++typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
++typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
++typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
++#define onechar(c, s, l) (((uint64_t)(c)) << (s))
++#else
++#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
++#endif
++#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
++#define TRIE_HASH_MULTI_BYTE
++#endif
++#endif /*GNUC */
++#ifdef TRIE_HASH_MULTI_BYTE
++static enum PerfectKey PerfectHash4(const char *string)
++{
++    switch(*((triehash_uu32*) &string[0])) {
++    case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++        return Word;
++    }
++    return Unknown;
++}
++#else
++static enum PerfectKey PerfectHash4(const char *string)
++{
++    switch(string[0]) {
++    case 'W':
++        switch(string[1]) {
++        case 'o':
++            switch(string[2]) {
++            case 'r':
++                switch(string[3]) {
++                case 'd':
++                    return Word;
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++#endif /* TRIE_HASH_MULTI_BYTE */
++ enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    case 4:
++        return PerfectHash4(string);
++    default:
++        return Unknown;
++    }
++}" triehash -H /dev/null /dev/stdin
++
++
++# Check that we are generating no multi-byte session
++testsuccessequal "#include \"/dev/null\"
++static enum PerfectKey PerfectHash4(const char *string)
++{
++    switch(string[0]) {
++    case 'W':
++        switch(string[1]) {
++        case 'o':
++            switch(string[2]) {
++            case 'r':
++                switch(string[3]) {
++                case 'd':
++                    return Word;
++                }
++            }
++        }
++    }
++    return Unknown;
++}
++ enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++    switch (length) {
++    case 4:
++        return PerfectHash4(string);
++    default:
++        return Unknown;
++    }
++}" triehash --multi-byte=0 -H /dev/null /dev/stdin
diff --cc triehash/triehash.pl

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..c722cc62d861ca615f83015ff7b590479345b055

new file mode 100755 (executable)
--- /dev/null
--- /dev/null
+++ b/triehash/triehash.pl
@@@ -1,0 -1,0 +1,660 @@@
++#!/usr/bin/perl -w
++#
++# Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
++#
++# Permission is hereby granted, free of charge, to any person obtaining a copy
++# of this software and associated documentation files (the "Software"), to deal
++# in the Software without restriction, including without limitation the rights
++# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++# copies of the Software, and to permit persons to whom the Software is
++# furnished to do so, subject to the following conditions:
++#
++# The above copyright notice and this permission notice shall be included in
++# all copies or substantial portions of the Software.
++#
++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++# THE SOFTWARE.
++
++
++=head1 NAME
++
++triehash - Generate a perfect hash function derived from a trie.
++
++=cut
++
++use strict;
++use warnings;
++use Getopt::Long;
++
++=head1 SYNOPSIS
++
++B<triehash> [S<I<option>>] [S<I<input file>>]
++
++=head1 DESCRIPTION
++
++triehash takes a list of words in input file and generates a function and
++an enumeration to describe the word
++
++=head1 INPUT FILE FORMAT
++
++The file consists of multiple lines of the form:
++
++    [label ~ ] word [= value]
++
++This maps word to value, and generates an enumeration with entries of the form:
++
++    label = value
++
++If I<label> is undefined, the word will be used, the minus character will be
++replaced by an underscore. If value is undefined it is counted upwards from
++the last value.
++
++There may also be one line of the format
++
++    [ label ~] = value
++
++Which defines the value to be used for non-existing keys. Note that this also
++changes default value for other keys, as for normal entries. So if you place
++
++    = 0
++
++at the beginning of the file, unknown strings map to 0, and the other strings
++map to values starting with 1. If label is not specified, the default is
++I<Unknown>.
++
++=head1 OPTIONS
++
++=over 4
++
++=item B<-C>I<.c file> B<--code>=I<.c file>
++
++Generate code in the given file.
++
++=item B<-H>I<header file> B<--header>=I<header file>
++
++Generate a header in the given file, containing a declaration of the hash
++function and an enumeration.
++
++=item B<--enum-name=>I<word>
++
++The name of the enumeration.
++
++=item B<--function-name=>I<word>
++
++The name of the function.
++
++=item B<--namespace=>I<name>
++
++Put the function and enum into a namespace (C++)
++
++=item B<--class=>I<name>
++
++Put the function and enum into a class (C++)
++
++=item B<--enum-class>
++
++Generate an enum class instead of an enum (C++)
++
++=item B<--counter-name=>I<name>
++
++Use I<name> for a counter that is set to the latest entry in the enumeration
+++ 1. This can be useful for defining array sizes.
++
++=item B<--extern-c>
++
++Wrap everything into an extern "C" block. Not compatible with the C++
++options, as a header with namespaces, classes, or enum classes is not
++valid C.
++
++=item B<--multi-byte>=I<value>
++
++Generate code reading multiple bytes at once. The value is a string of power
++of twos to enable. The default value is 320 meaning that 8, 4, and single byte
++reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you
++also want to allow 2-byte reads. 2-byte reads are disabled by default because
++they negatively affect performance on older Intel architectures.
++
++This generates code for both multiple bytes and single byte reads, but only
++enables the multiple byte reads of GNU C compatible compilers, as the following
++extensions are used:
++
++=over 8
++
++=item Byte-aligned integers
++
++We must be able to generate integers that are aligned to a single byte using:
++
++    typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
++
++=item Byte-order
++
++The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.
++
++=back
++
++We forcefully disable multi-byte reads on platforms where the variable
++I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
++as there is a measurable overhead from emulating the unaligned reads on
++ARM.
++
++=item B<--language=>I<language>
++
++Generate a file in the specified language. Currently known are 'C' and 'tree',
++the latter generating a tree.
++
++=item B<--include=>I<header>
++
++Add the header to the include statements of the header file. The value must
++be surrounded by quotes or angle brackets for C code. May be specified multiple
++times.
++
++=back
++
++=cut
++
++my $unknown = -1;
++my $unknown_label = "Unknown";
++my $counter_start = 0;
++my $enum_name = "PerfectKey";
++my $function_name = "PerfectHash";
++my $enum_class = 0;
++
++my $code_name = "-";
++my $header_name = "-";
++my $code;
++my $header;
++my $ignore_case = 0;
++my $multi_byte = "320";
++my $language = 'C';
++my $counter_name = undef;
++my @includes = ();
++
++
++Getopt::Long::config('default',
++                     'bundling',
++                     'no_getopt_compat',
++                     'no_auto_abbrev',
++                     'permute',
++                     'auto_help');
++
++GetOptions ("code|C=s" => \$code_name,
++            "header|H=s"   => \$header_name,
++            "function-name=s" => \$function_name,
++            "ignore-case" => \$ignore_case,
++            "enum-name=s" => \$enum_name,
++            "language|l=s" => \$language,
++            "multi-byte=s" => \$multi_byte,
++            "enum-class" => \$enum_class,
++            "include=s" => \@includes,
++            "counter-name=s" => \$counter_name)
++    or die("Could not parse options!");
++
++
++package Trie {
++
++    sub new {
++        my $class = shift;
++        my $self = {};
++        bless $self, $class;
++
++        $self->{children} = {};
++        $self->{value} = undef;
++        $self->{label} = undef;
++
++        return $self;
++    }
++
++    # Return the largest power of 2 smaller or equal to the argument
++    sub alignpower2 {
++        my ($self, $length) = @_;
++
++        return 8 if ($length >= 8 && $multi_byte =~ /3/);
++        return 4 if ($length >= 4 && $multi_byte =~ /2/);
++        return 2 if ($length >= 2 && $multi_byte =~ /1/);
++
++        return 1;
++    }
++
++    # Split the key into a head block and a tail
++    sub split_key {
++        my ($self, $key) = @_;
++        my $length = length $key;
++        my $split = $self->alignpower2($length);
++
++        return (substr($key, 0, $split), substr($key, $split));
++    }
++
++    sub insert {
++        my ($self, $key, $label, $value) = @_;
++
++        if (length($key) == 0) {
++            $self->{label} = $label;
++            $self->{value} = $value;
++            return;
++        }
++
++        my ($child, $tail) = $self->split_key($key);
++
++        $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child}));
++
++        $self->{children}{$child}->insert($tail, $label, $value);
++    }
++
++    sub filter_depth {
++        my ($self, $togo) = @_;
++
++        my $new = Trie->new;
++
++        if ($togo != 0) {
++            my $found = 0;
++            foreach my $key (sort keys %{$self->{children}}) {
++                if ($togo > length($key) || defined $self->{children}{$key}->{value}) {
++                    my $child = $self->{children}{$key}->filter_depth($togo - length($key));
++
++                    $new->{children}{$key}= $child if defined $child;
++                    $found = 1 if defined $child;
++                }
++            }
++            return undef if (!$found);
++        } else {
++            $new->{value} = $self->{value};
++            $new->{label} = $self->{label};
++        }
++
++        return $new;
++    }
++
++    # Reinsert all value nodes into the specified $trie, prepending $prefix
++    # to their $paths.
++    sub reinsert_value_nodes_into {
++        my ($self, $trie, $prefix) = @_;
++
++        $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value});
++
++        foreach my $key (sort keys %{$self->{children}}) {
++            $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
++        }
++    }
++
++    # Find an earlier split due a an ambiguous character
++    sub find_ealier_split {
++        my ($self, $key) = @_;
++
++        if ($ignore_case) {
++            for my $i (0..length($key)-1) {
++                # If the key starts with an ambiguous character, we need to
++                # take only it. Otherwise, we need to take everything
++                # before the character.
++                return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1)));
++            }
++        }
++        return $self->alignpower2(length $key);
++    }
++
++    # Rebuild the trie, splitting at ambigous chars, and unifying key lengths
++    sub rebuild_tree {
++        my $self = shift;
++        # Determine if/where we need to split before an ambiguous character
++        my $new_split = 99999999999999999;
++        foreach my $key (sort keys %{$self->{children}}) {
++            my $special_length = $self->find_ealier_split($key);
++            $new_split = $special_length if ($special_length < $new_split);
++        }
++
++        # Start building a new uniform trie
++        my $newself = Trie->new;
++        $newself->{label} = $self->{label};
++        $newself->{value} = $self->{value};
++        $newself->{children} = {};
++
++        foreach my $key (sort keys %{$self->{children}}) {
++            my $head = substr($key, 0, $new_split);
++            my $tail = substr($key, $new_split);
++            # Rebuild the child node at $head, pushing $tail downwards
++            $newself->{children}{$head} //= Trie->new;
++            $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail);
++            # We took up to one special character of each key label. There might
++            # be more, so we need to rebuild recursively.
++            $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree();
++        }
++
++        return $newself;
++    }
++}
++
++# Code generator for C and C++
++package CCodeGen {
++    my $static = ($code_name eq $header_name) ? "static" : "";
++    my $enum_specifier = $enum_class ? "enum class" : "enum";
++
++    sub new {
++        my $class = shift;
++        my $self = {};
++        bless $self, $class;
++
++        return $self;
++    }
++
++    sub open_output {
++        my $self = shift;
++        if ($code_name ne "-") {
++            open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
++        } else {
++            $code = *STDOUT;
++        }
++        if($code_name eq $header_name) {
++            $header = $code;
++        } elsif ($header_name ne "-") {
++            open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
++        } else {
++            $header = *STDOUT;
++        }
++    }
++
++    sub word_to_label {
++        my ($class, $word) = @_;
++
++        $word =~ s/_/__/g;
++        $word =~ s/-/_/g;
++        return $word;
++    }
++
++    # Return a case label, by shifting and or-ing bytes in the word
++    sub case_label {
++        my ($self, $key) = @_;
++
++        return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;
++
++        my $output = '0';
++
++        for my $i (0..length($key)-1) {
++            $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
++        }
++
++        return $output;
++    }
++
++    # Return an appropriate read instruction for $length bytes from $offset
++    sub switch_key {
++        my ($self, $offset, $length) = @_;
++
++        return "string[$offset]" if $length == 1;
++        return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
++    }
++
++    sub print_table {
++        my ($self, $trie, $fh, $indent, $index) = @_;
++        $indent //= 0;
++        $index //= 0;
++
++        if (defined $trie->{value}) {
++            printf $fh ("    " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
++            return;
++        }
++
++        # The difference between lowercase and uppercase alphabetical characters
++        # is that they have one bit flipped. If we have alphabetical characters
++        # in the search space, and the entire search space works fine if we
++        # always turn on the flip, just OR the character we are switching over
++        # with the bit.
++        my $want_use_bit = 0;
++        my $can_use_bit = 1;
++        my $key_length = 0;
++        foreach my $key (sort keys %{$trie->{children}}) {
++            $can_use_bit &= not main::ambiguous($key);
++            $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
++            $key_length = length($key);
++        }
++
++        if ($ignore_case && $can_use_bit && $want_use_bit) {
++            printf $fh (("    " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
++        } else {
++            printf $fh (("    " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
++        }
++
++        my $notfirst = 0;
++        foreach my $key (sort keys %{$trie->{children}}) {
++            if ($notfirst) {
++                printf $fh ("    " x $indent . "    break;\n");
++            }
++            if ($ignore_case) {
++                printf $fh ("    " x $indent . "case %s:\n", $self->case_label(lc($key)));
++                printf $fh ("    " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
++            } else {
++                printf $fh ("    " x $indent . "case %s:\n", $self->case_label($key));
++            }
++
++            $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
++
++            $notfirst=1;
++        }
++
++        printf $fh ("    " x $indent . "}\n");
++    }
++
++    sub print_words {
++        my ($self, $trie, $fh, $indent, $sofar) = @_;
++
++        $indent //= 0;
++        $sofar //= "";
++
++
++        printf $fh ("    " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value};
++
++        foreach my $key (sort keys %{$trie->{children}}) {
++            $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key);
++        }
++    }
++
++    sub print_functions {
++        my ($self, $trie, %lengths) = @_;
++        foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
++            print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
++            print $code ("{\n");
++            $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
++            printf $code ("    return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
++            print $code ("}\n");
++        }
++    }
++
++    sub main {
++        my ($self, $trie, $num_values, %lengths) = @_;
++        print $header ("#ifndef TRIE_HASH_${function_name}\n");
++        print $header ("#define TRIE_HASH_${function_name}\n");
++        print $header ("#include <stddef.h>\n");
++        print $header ("#include <stdint.h>\n");
++        foreach my $include (@includes) {
++            print $header ("#include $include\n");
++        }
++        printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
++        print $header ("${enum_specifier} ${enum_name} {\n");
++        $self->print_words($trie, $header, 1);
++        printf $header ("    $unknown_label = $unknown,\n");
++        print $header ("};\n");
++        print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");
++
++        print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);
++
++        if ($multi_byte) {
++            print $code ("#ifdef __GNUC__\n");
++            for (my $i=16; $i <= 64; $i *= 2) {
++                print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
++                print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
++            }
++
++            print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
++            print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
++            print $code ("#else\n");
++            print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
++            print $code ("#endif\n");
++            print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
++            print $code ("#define TRIE_HASH_MULTI_BYTE\n");
++            print $code ("#endif\n");
++            print $code ("#endif /*GNUC */\n");
++
++            print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
++            $self->print_functions($trie, %lengths);
++            $multi_byte = 0;
++            print $code ("#else\n");
++            $self->print_functions($trie, %lengths);
++            print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
++        } else {
++            $self->print_functions($trie, %lengths);
++        }
++
++        print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
++        print $code ("{\n");
++        print $code ("    switch (length) {\n");
++        foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
++            print $code ("    case $local_length:\n");
++            print $code ("        return ${function_name}${local_length}(string);\n");
++        }
++        print $code ("    default:\n");
++        printf $code ("        return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
++        print $code ("    }\n");
++        print $code ("}\n");
++
++        # Print end of header here, in case header and code point to the same file
++        print $header ("#endif                       /* TRIE_HASH_${function_name} */\n");
++    }
++}
++
++# Check if the word can be reached by exactly one word in (alphabet OR 0x20).
++sub ambiguous {
++    my $word = shift;
++
++    foreach my $char (split //, $word) {
++        # Setting the lowercase flag in the character produces a different
++        # character, the character would thus not be matched.
++        return 1 if ((ord($char) | 0x20) != ord(lc($char)));
++
++        # A word is also ambiguous if any character in lowercase can be reached
++        # by ORing 0x20 from another character in the charset that is not a
++        # lowercase character of the current character.
++        # Assume that we have UTF-8 and the most significant bit can be set
++        for my $i (0..255) {
++            return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char));
++        }
++    }
++
++    return 0;
++}
++
++sub build_trie {
++    my $codegen = shift;
++    my $trie = Trie->new;
++
++    my $counter = $counter_start;
++    my %lengths;
++
++    open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
++    while (my $line = <$input>) {
++        my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/;
++
++        if (defined $word) {
++            $counter = $value if defined($value);
++            $label //= $codegen->word_to_label($word);
++
++            $trie->insert($word, $label, $counter);
++            $lengths{length($word)} = 1;
++            $counter++;
++        } elsif (defined $value) {
++            $unknown = $value;
++            $unknown_label = $label if defined($label);
++            $counter = $value + 1;
++        } else {
++            die "Invalid line: $line";
++        }
++    }
++
++    return ($trie, $counter, %lengths);
++}
++
++# Generates an ASCII art tree
++package TreeCodeGen {
++
++    sub new {
++        my $class = shift;
++        my $self = {};
++        bless $self, $class;
++
++        return $self;
++    }
++
++    sub word_to_label {
++        my ($self, $word) = @_;
++        return $word;
++    }
++
++    sub main {
++        my ($self, $trie, $counter, %lengths) = @_;
++        printf $code ("┌────────────────────────────────────────────────────┐\n");
++        printf $code ("│                   Initial trie                     │\n");
++        printf $code ("└────────────────────────────────────────────────────┘\n");
++        $self->print($trie);
++        printf $code ("┌────────────────────────────────────────────────────┐\n");
++        printf $code ("│                   Rebuilt trie                     │\n");
++        printf $code ("└────────────────────────────────────────────────────┘\n");
++        $self->print($trie->rebuild_tree());
++
++        foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
++            printf $code ("┌────────────────────────────────────────────────────┐\n");
++            printf $code ("│              Trie for words of length %-4d         │\n", $local_length);
++            printf $code ("└────────────────────────────────────────────────────┘\n");
++            $self->print($trie->filter_depth($local_length)->rebuild_tree());
++        }
++    }
++
++    sub open_output {
++        my $self = shift;
++        if ($code_name ne "-") {
++            open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
++        } else {
++            $code = *STDOUT;
++        }
++    }
++
++    # Print a trie
++    sub print {
++        my ($self, $trie, $depth) = @_;
++        $depth //= 0;
++
++        print $code (" → ") if defined($trie->{label});
++        print $code ($trie->{label} // "", "\n");
++        foreach my $key (sort keys %{$trie->{children}}) {
++            print $code ("│   " x ($depth), "├── $key");
++            $self->print($trie->{children}{$key}, $depth + 1);
++        }
++    }
++}
++
++my %codegens = (
++    C => "CCodeGen",
++    tree => "TreeCodeGen",
++);
++
++
++defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
++my $codegen = $codegens{$language}->new();
++my ($trie, $counter, %lengths) = build_trie($codegen);
++
++$codegen->open_output();
++$codegen->main($trie, $counter, %lengths);
++
++
++=head1 LICENSE
++
++triehash is available under the MIT/Expat license, see the source code
++for more information.
++
++=head1 AUTHOR
++
++Julian Andres Klode <jak@jak-linux.org>
++
++=cut
++
author	Julian Andres Klode <jak@debian.org>
	Tue, 22 Nov 2016 21:57:46 +0000 (22:57 +0100)
committer	Julian Andres Klode <jak@debian.org>
	Tue, 22 Nov 2016 21:57:46 +0000 (22:57 +0100)
		1	2
triehash/.travis.yml	patch \|	\|	\|	blob
triehash/LICENSE.md	patch \|	\|	\|	blob
triehash/README.md	patch \|	\|	\|	blob
triehash/tests/framework.sh	patch \|	\|	\|	blob
triehash/tests/run-tests.sh	patch \|	\|	\|	blob
triehash/tests/test-basic	patch \|	\|	\|	blob
triehash/tests/test-case-insensitive	patch \|	\|	\|	blob
triehash/tests/test-enum-include-name-flags	patch \|	\|	\|	blob
triehash/tests/test-multi-byte-level	patch \|	\|	\|	blob
triehash/triehash.pl	patch \|	\|	\|	blob