--- /dev/null
--- /dev/null
++language: perl
++perl:
++ - "5.24"
++ - "5.22"
++ - "5.20"
++ - "5.18"
++ - "5.16"
++ - "5.14"
++install:
++ - cpanm --quiet --notest --skip-satisfied Devel::Cover Devel::Cover::Report::Codecov
++script:
++ - ./tests/run-tests.sh
++after_script:
++ - cover -report codecov
--- /dev/null
--- /dev/null
++Permission is hereby granted, free of charge, to any person obtaining a copy
++of this software and associated documentation files (the "Software"), to deal
++in the Software without restriction, including without limitation the rights
++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++copies of the Software, and to permit persons to whom the Software is
++furnished to do so, subject to the following conditions:
++
++The above copyright notice and this permission notice shall be included in
++all copies or substantial portions of the Software.
++
++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++THE SOFTWARE.
--- /dev/null
--- /dev/null
++# Order-preserving minimal perfect hash function generator
++
++Build order-preserving minimal perfect hash functions.
++
++[![codecov](https://codecov.io/gh/julian-klode/triehash/branch/master/graph/badge.svg)](https://codecov.io/gh/julian-klode/triehash)
++[![Build Status](https://travis-ci.org/julian-klode/triehash.svg?branch=master)](https://travis-ci.org/julian-klode/triehash)
++
++## Performance
++
++Performance was evaluated against other hash functions. As an input set, the
++fields of Debian Packages and Sources files was used, and each hash function
++was run 1,000,000 times for each word. The byte count of the words were then
++summed up and divided by the total number of nanoseconds each function ran, so
++all speeds below are given in bytes per nanosecond, AKA gigabyte per second.
++
++arch/function|jak-x230 (amd64)|backup (amd64)|asachi.d.o (arm64)|asachi.d.o (armel)|asachi.d.o (armhf)|plummer.d.o (ppc64el)|eller.d.o (mipsel)
++-------------|----------------|--------------|------------------|------------------|------------------|---------------------|------------------
++Trie | 2.4 | 1.9 | 1.2 | 0.9 | 0.8 | 2.0 | 0.2
++Trie (*) | 2.2 | 1.7 | 0.8 | 0.7 | 0.7 | 1.8 | 0.2
++re2c | 1.7 | 1.3 | 0.9 | 0.9 | 0.7 | 1.6 | 0.2
++re2c (*) | 1.2 | 0.9 | 0.6 | 0.6 | 0.5 | 1.1 | 0.1
++gperf (*) | 0.7 | 0.5 | 0.2 | 0.2 | 0.2 | 0.5 | 0.1
++gperf | 1.3 | 0.9 | 0.3 | 0.3 | 0.2 | 0.4 | 0.1
++djb (*) | 0.7 | 0.5 | 0.3 | 0.3 | 0.3 | 0.5 | 0.1
++djb (**) | 1.0 | 0.7 | 0.4 | 0.5 | 0.5 | 0.6 | 0.2
++djb | 0.9 | 0.7 | 0.5 | 0.5 | 0.5 | 0.7 | 0.2
++apt (*) | 1.2 | 0.9 | 0.7 | 0.7 | 0.7 | 1.1 | 0.2
++apt (**) | 2.3 | 1.7 | 0.7 | 0.9 | 0.8 | 1.9 | 0.2
++
++And transposed:
++
++function/arch |Trie |Trie (*) |re2c |re2c (*) |gperf (*)|gperf |djb (*) |djb (**) |djb |apt (*) |apt (**)
++---------------------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------
++jak-x230 (amd64) | 2.4| 2.2| 1.7| 1.2| 0.7| 1.3| 0.7| 1.0| 0.9| 1.2| 2.3
++backup (amd64) | 1.9| 1.7| 1.3| 0.9| 0.5| 0.9| 0.5| 0.7| 0.7| 0.9| 1.7
++asachi.d.o (arm64) | 1.2| 0.8| 0.9| 0.6| 0.2| 0.3| 0.3| 0.4| 0.5| 0.7| 0.7
++asachi.d.o (armel) | 0.9| 0.7| 0.9| 0.6| 0.2| 0.3| 0.3| 0.5| 0.5| 0.7| 0.9
++asachi.d.o (armhf) | 0.8| 0.7| 0.7| 0.5| 0.2| 0.2| 0.3| 0.5| 0.5| 0.7| 0.8
++plummer.d.o (ppc64el)| 2.0| 1.8| 1.6| 1.1| 0.5| 0.4| 0.5| 0.6| 0.7| 1.1| 1.9
++eller.d.o (mipsel) | 0.2| 0.2| 0.2| 0.1| 0.1| 0.1| 0.1| 0.2| 0.2| 0.2| 0.2
++
++
++Legend:
++
++* The (*) variants are case-insensitive, (**) are more optimised versions
++ of the (*) versions.
++* DJB (*) is a DJB Hash with naive lowercase conversion, DJB (**) just ORs one
++ bit into each value to get alphabetical characters to be lowercase
++* APT (*) is the AlphaHash function from APT which hashes the last 8 bytes in a
++ word in a case-insensitive manner. APT (**) is the same function unrolled.
++* All hosts except the x230 are Debian porterboxes. The x230 has a Core i5-3320M,
++ barriere has an Opteron 23xx.
++
++Notes:
++
++* The overhead is larger than needed on some platforms due to gcc inserting
++ unneeded zero extend instructions, see:
++ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77729
--- /dev/null
--- /dev/null
++#!/bin/sh
++# Simple integration test framework
++
++set -e
++
++
++cleanup() {
++ rm -f test.output test.c test.h test.tree
++}
++
++dumpone() {
++ if [ -e "$@" ]; then
++ echo "Content of $@:"
++ cat "$@" | sed "s#^#\t#g"
++ fi
++
++}
++
++dump() {
++ dumpone test.output
++ dumpone test.c
++ dumpone test.h
++ dumpone test.tree
++ return 1
++}
++
++testsuccess() {
++ [ "$INNER" ] || cleanup
++ [ "$INNER" ] || echo "Testing success of $@"
++ if ! "$@" > test.output 2>&1; then
++ echo "ERROR: Running $@ failed with error $?, messages were:" >&2
++ dump
++ return 1
++ fi
++}
++
++testfailure() {
++ [ "$INNER" ] || cleanup
++ [ "$INNER" ] || echo "Testing failure of $@"
++ if "$@" > test.output 2>&1; then
++ echo "ERROR: Running $@ unexpectedly succeeded, messages were:" >&2
++ dump
++ return 1
++ fi
++}
++
++testfileequal() {
++ [ "$INNER" ] || echo "Testing output of $2"
++ printf "%b\n" "$1" > expected
++ if ! diff -u "expected" "$2" > test.diff; then
++ echo "ERROR: Differences between expected output and and $2:" >&2
++ cat test.diff | sed "s#^#\t#g"
++ dump
++ return 1
++ fi
++}
++
++testgrep() {
++ [ "$INNER" ] || echo "Testing grep $@"
++ INNER=1 testsuccess grep "$@"
++ unset INNER
++}
++
++testsuccessequal() {
++ expect="$1"
++ shift
++ cleanup
++ echo "Testing success and output of $@"
++ INNER=1 testsuccess "$@"
++ INNER=1 testfileequal "$expect" test.output
++ unset INNER
++}
++
++
++WORDS="Word-_0
++Word = 42
++VeryLongWord
++Label ~ Word2
++= -9"
++
++triehash() {
++ printf "%b\n" "$WORDS" | perl -MDevel::Cover=-summary,0,-silent,1 $(dirname $(dirname $(readlink -f $0)))/triehash.pl "$@" || return $?
++ return $?
++}
--- /dev/null
--- /dev/null
++#!/bin/sh
++DIR=$(dirname $(readlink -f $0))
++
++# Let's go into triehash.pl's dir
++cd $(dirname "$DIR")
++
++rm -rf cover_db
++
++count=$(cd "$DIR" && echo test-* | wc -w)
++i=1
++
++for test in $DIR/test-*; do
++ echo "[$i/$count] Running testcase $test"
++ if ! $test > test.summary 2>&1; then
++ cat test.summary
++ exit 1
++ fi
++ i=$((i + 1))
++done
++
++
++cover
--- /dev/null
--- /dev/null
++#!/bin/sh
++. $(dirname $(readlink -f $0))/framework.sh
++
++# Check for non-existing files
++testfailure triehash -C /does/not/exist1 -H /does/not/exist1 /does/not/exist/input
++
++# Check that we can specify - for -C and -H
++testsuccessequal "#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum PerfectKey {
++ Unknown = -1,
++};
++static enum PerfectKey PerfectHash(const char *string, size_t length);
++static enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ default:
++ return Unknown;
++ }
++}
++#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 -C - -H -
++
++# Check that split files work
++testsuccess triehash -C test.c -H test.h --multi-byte=0
++testfileequal "#include \"test.h\"
++ enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ default:
++ return Unknown;
++ }
++}" test.c
++testfileequal "#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum PerfectKey {
++ Unknown = -1,
++};
++ enum PerfectKey PerfectHash(const char *string, size_t length);
++#endif /* TRIE_HASH_PerfectHash */" test.h
++
++
++# Check the C code generator
++testsuccess triehash -C test.c -H test.c /dev/stdin
++testfileequal "#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum PerfectKey {
++ VeryLongWord = 43,
++ Word = 42,
++ Word___0 = 0,
++ Label = 44,
++ Unknown = -9,
++};
++static enum PerfectKey PerfectHash(const char *string, size_t length);
++#ifdef __GNUC__
++typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
++typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
++typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
++typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
++typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
++typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
++#define onechar(c, s, l) (((uint64_t)(c)) << (s))
++#else
++#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
++#endif
++#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
++#define TRIE_HASH_MULTI_BYTE
++#endif
++#endif /*GNUC */
++#ifdef TRIE_HASH_MULTI_BYTE
++static enum PerfectKey PerfectHash4(const char *string)
++{
++ switch(*((triehash_uu32*) &string[0])) {
++ case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++ return Word;
++ }
++ return Unknown;
++}
++static enum PerfectKey PerfectHash5(const char *string)
++{
++ switch(*((triehash_uu32*) &string[0])) {
++ case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++ switch(string[4]) {
++ case 0| onechar('2', 0, 8):
++ return Label;
++ }
++ }
++ return Unknown;
++}
++static enum PerfectKey PerfectHash7(const char *string)
++{
++ switch(*((triehash_uu32*) &string[0])) {
++ case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++ switch(string[4]) {
++ case 0| onechar('-', 0, 8):
++ switch(string[5]) {
++ case 0| onechar('_', 0, 8):
++ switch(string[6]) {
++ case 0| onechar('0', 0, 8):
++ return Word___0;
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++static enum PerfectKey PerfectHash12(const char *string)
++{
++ switch(*((triehash_uu64*) &string[0])) {
++ case 0| onechar('V', 0, 64)| onechar('e', 8, 64)| onechar('r', 16, 64)| onechar('y', 24, 64)| onechar('L', 32, 64)| onechar('o', 40, 64)| onechar('n', 48, 64)| onechar('g', 56, 64):
++ switch(*((triehash_uu32*) &string[8])) {
++ case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++ return VeryLongWord;
++ }
++ }
++ return Unknown;
++}
++#else
++static enum PerfectKey PerfectHash4(const char *string)
++{
++ switch(string[0]) {
++ case 'W':
++ switch(string[1]) {
++ case 'o':
++ switch(string[2]) {
++ case 'r':
++ switch(string[3]) {
++ case 'd':
++ return Word;
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++static enum PerfectKey PerfectHash5(const char *string)
++{
++ switch(string[0]) {
++ case 'W':
++ switch(string[1]) {
++ case 'o':
++ switch(string[2]) {
++ case 'r':
++ switch(string[3]) {
++ case 'd':
++ switch(string[4]) {
++ case '2':
++ return Label;
++ }
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++static enum PerfectKey PerfectHash7(const char *string)
++{
++ switch(string[0]) {
++ case 'W':
++ switch(string[1]) {
++ case 'o':
++ switch(string[2]) {
++ case 'r':
++ switch(string[3]) {
++ case 'd':
++ switch(string[4]) {
++ case '-':
++ switch(string[5]) {
++ case '_':
++ switch(string[6]) {
++ case '0':
++ return Word___0;
++ }
++ }
++ }
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++static enum PerfectKey PerfectHash12(const char *string)
++{
++ switch(string[0]) {
++ case 'V':
++ switch(string[1]) {
++ case 'e':
++ switch(string[2]) {
++ case 'r':
++ switch(string[3]) {
++ case 'y':
++ switch(string[4]) {
++ case 'L':
++ switch(string[5]) {
++ case 'o':
++ switch(string[6]) {
++ case 'n':
++ switch(string[7]) {
++ case 'g':
++ switch(string[8]) {
++ case 'W':
++ switch(string[9]) {
++ case 'o':
++ switch(string[10]) {
++ case 'r':
++ switch(string[11]) {
++ case 'd':
++ return VeryLongWord;
++ }
++ }
++ }
++ }
++ }
++ }
++ }
++ }
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++#endif /* TRIE_HASH_MULTI_BYTE */
++static enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ case 4:
++ return PerfectHash4(string);
++ case 5:
++ return PerfectHash5(string);
++ case 7:
++ return PerfectHash7(string);
++ case 12:
++ return PerfectHash12(string);
++ default:
++ return Unknown;
++ }
++}
++#endif /* TRIE_HASH_PerfectHash */" test.c
--- /dev/null
--- /dev/null
++#!/bin/sh
++. $(dirname $(readlink -f $0))/framework.sh
++
++WORDS="Halllo\nH-lllo\nHalll1"
++
++# Case-insensitive test
++testsuccessequal "#include \"/dev/null\"
++#ifdef __GNUC__
++typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
++typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
++typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
++typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
++typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
++typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
++#define onechar(c, s, l) (((uint64_t)(c)) << (s))
++#else
++#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
++#endif
++#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
++#define TRIE_HASH_MULTI_BYTE
++#endif
++#endif /*GNUC */
++#ifdef TRIE_HASH_MULTI_BYTE
++static enum PerfectKey PerfectHash6(const char *string)
++{
++ switch(string[0] | 0x20) {
++ case 0| onechar('h', 0, 8):
++ switch(string[1]) {
++ case 0| onechar('-', 0, 8):
++ switch(*((triehash_uu32*) &string[2]) | 0x20202020) {
++ case 0| onechar('l', 0, 32)| onechar('l', 8, 32)| onechar('l', 16, 32)| onechar('o', 24, 32):
++ return H_lllo;
++ }
++ break;
++ case 0| onechar('a', 0, 8):
++ case 0| onechar('A', 0, 8):
++ switch(*((triehash_uu16*) &string[2]) | 0x2020) {
++ case 0| onechar('l', 0, 16)| onechar('l', 8, 16):
++ switch(string[4] | 0x20) {
++ case 0| onechar('l', 0, 8):
++ switch(string[5]) {
++ case 0| onechar('1', 0, 8):
++ return Halll1;
++ break;
++ case 0| onechar('o', 0, 8):
++ case 0| onechar('O', 0, 8):
++ return Halllo;
++ }
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++#else
++static enum PerfectKey PerfectHash6(const char *string)
++{
++ switch(string[0] | 0x20) {
++ case 'h':
++ switch(string[1]) {
++ case '-':
++ switch(string[2] | 0x20) {
++ case 'l':
++ switch(string[3] | 0x20) {
++ case 'l':
++ switch(string[4] | 0x20) {
++ case 'l':
++ switch(string[5] | 0x20) {
++ case 'o':
++ return H_lllo;
++ }
++ }
++ }
++ }
++ break;
++ case 'a':
++ case 'A':
++ switch(string[2] | 0x20) {
++ case 'l':
++ switch(string[3] | 0x20) {
++ case 'l':
++ switch(string[4] | 0x20) {
++ case 'l':
++ switch(string[5]) {
++ case '1':
++ return Halll1;
++ break;
++ case 'o':
++ case 'O':
++ return Halllo;
++ }
++ }
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++#endif /* TRIE_HASH_MULTI_BYTE */
++ enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ case 6:
++ return PerfectHash6(string);
++ default:
++ return Unknown;
++ }
++}" triehash --multi-byte=3210 --ignore-case -H /dev/null /dev/stdin
--- /dev/null
--- /dev/null
++#!/bin/sh
++. $(dirname $(readlink -f $0))/framework.sh
++
++# Need a short word, we really just need to check if the labels work
++WORDS=w
++
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++#include <foo.h>
++enum PerfectKey {
++ w = 0,
++ Unknown = -1,
++};
++ enum PerfectKey PerfectHash(const char *string, size_t length);
++#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 -C /dev/null --include="<foo.h>" /dev/stdin
++
++# Check for --enum-class support
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum class PerfectKey {
++ w = 0,
++ Unknown = -1,
++};
++static enum PerfectKey PerfectHash(const char *string, size_t length);
++static enum PerfectKey PerfectHash1(const char *string)
++{
++ switch(string[0]) {
++ case 'w':
++ return PerfectKey::w;
++ }
++ return PerfectKey::Unknown;
++}
++static enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ case 1:
++ return PerfectHash1(string);
++ default:
++ return PerfectKey::Unknown;
++ }
++}
++#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 --enum-class /dev/stdin
++
++# Check for --enum-name support
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum Foo {
++ Unknown = -1,
++};
++static enum Foo PerfectHash(const char *string, size_t length);
++static enum Foo PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ default:
++ return Unknown;
++ }
++}
++#endif /* TRIE_HASH_PerfectHash */\
++" triehash --multi-byte=0 --enum-name="Foo"
++
++# Check for --enum-class support
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum class Foo::Bar {
++ Unknown = -1,
++};
++static enum Foo::Bar PerfectHash(const char *string, size_t length);
++static enum Foo::Bar PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ default:
++ return Foo::Bar::Unknown;
++ }
++}
++#endif /* TRIE_HASH_PerfectHash */\
++" triehash --multi-byte=0 --enum-class --enum-name="Foo::Bar"
++
++# Check for --function-name support
++testsuccessequal "\
++#ifndef TRIE_HASH_NonSense
++#define TRIE_HASH_NonSense
++#include <stddef.h>
++#include <stdint.h>
++enum PerfectKey {
++ Unknown = -1,
++};
++static enum PerfectKey NonSense(const char *string, size_t length);
++static enum PerfectKey NonSense(const char *string, size_t length)
++{
++ switch (length) {
++ default:
++ return Unknown;
++ }
++}
++#endif /* TRIE_HASH_NonSense */\
++" triehash --multi-byte=0 --function-name="NonSense"
++
++# Check for --counter-name support
++testsuccessequal "\
++#ifndef TRIE_HASH_PerfectHash
++#define TRIE_HASH_PerfectHash
++#include <stddef.h>
++#include <stdint.h>
++enum { MyCounter = 0 };
++enum PerfectKey {
++ Unknown = -1,
++};
++static enum PerfectKey PerfectHash(const char *string, size_t length);
++static enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ default:
++ return Unknown;
++ }
++}
++#endif /* TRIE_HASH_PerfectHash */\
++" triehash --multi-byte=0 --counter-name="MyCounter"
--- /dev/null
--- /dev/null
++#!/bin/sh
++. $(dirname $(readlink -f $0))/framework.sh
++
++# Check that building a single-byte trie works
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│ Initial trie │
++└────────────────────────────────────────────────────┘
++
++├── V
++│ ├── e
++│ │ ├── r
++│ │ │ ├── y
++│ │ │ │ ├── L
++│ │ │ │ │ ├── o
++│ │ │ │ │ │ ├── n
++│ │ │ │ │ │ │ ├── g
++│ │ │ │ │ │ │ │ ├── W
++│ │ │ │ │ │ │ │ │ ├── o
++│ │ │ │ │ │ │ │ │ │ ├── r
++│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d → Word
++│ │ │ │ ├── -
++│ │ │ │ │ ├── _
++│ │ │ │ │ │ ├── 0 → Word-_0
++│ │ │ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Rebuilt trie │
++└────────────────────────────────────────────────────┘
++
++├── V
++│ ├── e
++│ │ ├── r
++│ │ │ ├── y
++│ │ │ │ ├── L
++│ │ │ │ │ ├── o
++│ │ │ │ │ │ ├── n
++│ │ │ │ │ │ │ ├── g
++│ │ │ │ │ │ │ │ ├── W
++│ │ │ │ │ │ │ │ │ ├── o
++│ │ │ │ │ │ │ │ │ │ ├── r
++│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d → Word
++│ │ │ │ ├── -
++│ │ │ │ │ ├── _
++│ │ │ │ │ │ ├── 0 → Word-_0
++│ │ │ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 4 │
++└────────────────────────────────────────────────────┘
++
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d → Word
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 5 │
++└────────────────────────────────────────────────────┘
++
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d
++│ │ │ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 7 │
++└────────────────────────────────────────────────────┘
++
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d
++│ │ │ │ ├── -
++│ │ │ │ │ ├── _
++│ │ │ │ │ │ ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 12 │
++└────────────────────────────────────────────────────┘
++
++├── V
++│ ├── e
++│ │ ├── r
++│ │ │ ├── y
++│ │ │ │ ├── L
++│ │ │ │ │ ├── o
++│ │ │ │ │ │ ├── n
++│ │ │ │ │ │ │ ├── g
++│ │ │ │ │ │ │ │ ├── W
++│ │ │ │ │ │ │ │ │ ├── o
++│ │ │ │ │ │ │ │ │ │ ├── r
++│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord" triehash --multi-byte=0 -l tree /dev/stdin
++
++# Two byte optimization
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│ Initial trie │
++└────────────────────────────────────────────────────┘
++
++├── Ve
++│ ├── ry
++│ │ ├── Lo
++│ │ │ ├── ng
++│ │ │ │ ├── Wo
++│ │ │ │ │ ├── rd → VeryLongWord
++├── Wo
++│ ├── rd → Word
++│ │ ├── -_
++│ │ │ ├── 0 → Word-_0
++│ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Rebuilt trie │
++└────────────────────────────────────────────────────┘
++
++├── Ve
++│ ├── ry
++│ │ ├── Lo
++│ │ │ ├── ng
++│ │ │ │ ├── Wo
++│ │ │ │ │ ├── rd → VeryLongWord
++├── Wo
++│ ├── rd → Word
++│ │ ├── -
++│ │ │ ├── _0 → Word-_0
++│ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 4 │
++└────────────────────────────────────────────────────┘
++
++├── Wo
++│ ├── rd → Word
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 5 │
++└────────────────────────────────────────────────────┘
++
++├── Wo
++│ ├── rd
++│ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 7 │
++└────────────────────────────────────────────────────┘
++
++├── Wo
++│ ├── rd
++│ │ ├── -_
++│ │ │ ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 12 │
++└────────────────────────────────────────────────────┘
++
++├── Ve
++│ ├── ry
++│ │ ├── Lo
++│ │ │ ├── ng
++│ │ │ │ ├── Wo
++│ │ │ │ │ ├── rd → VeryLongWord" triehash --multi-byte=1 -l tree /dev/stdin
++# Four byte optimization
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│ Initial trie │
++└────────────────────────────────────────────────────┘
++
++├── Very
++│ ├── Long
++│ │ ├── Word → VeryLongWord
++├── Word → Word
++│ ├── -
++│ │ ├── _
++│ │ │ ├── 0 → Word-_0
++│ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Rebuilt trie │
++└────────────────────────────────────────────────────┘
++
++├── Very
++│ ├── Long
++│ │ ├── Word → VeryLongWord
++├── Word → Word
++│ ├── -
++│ │ ├── _
++│ │ │ ├── 0 → Word-_0
++│ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 4 │
++└────────────────────────────────────────────────────┘
++
++├── Word → Word
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 5 │
++└────────────────────────────────────────────────────┘
++
++├── Word
++│ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 7 │
++└────────────────────────────────────────────────────┘
++
++├── Word
++│ ├── -
++│ │ ├── _
++│ │ │ ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 12 │
++└────────────────────────────────────────────────────┘
++
++├── Very
++│ ├── Long
++│ │ ├── Word → VeryLongWord" triehash --multi-byte=2 -l tree /dev/stdin
++# Eigh byte optimization
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│ Initial trie │
++└────────────────────────────────────────────────────┘
++
++├── VeryLong
++│ ├── W
++│ │ ├── o
++│ │ │ ├── r
++│ │ │ │ ├── d → VeryLongWord
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d → Word
++│ │ │ │ ├── -
++│ │ │ │ │ ├── _
++│ │ │ │ │ │ ├── 0 → Word-_0
++│ │ │ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Rebuilt trie │
++└────────────────────────────────────────────────────┘
++
++├── V
++│ ├── eryLongW
++│ │ ├── o
++│ │ │ ├── r
++│ │ │ │ ├── d → VeryLongWord
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d → Word
++│ │ │ │ ├── -
++│ │ │ │ │ ├── _
++│ │ │ │ │ │ ├── 0 → Word-_0
++│ │ │ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 4 │
++└────────────────────────────────────────────────────┘
++
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d → Word
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 5 │
++└────────────────────────────────────────────────────┘
++
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d
++│ │ │ │ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 7 │
++└────────────────────────────────────────────────────┘
++
++├── W
++│ ├── o
++│ │ ├── r
++│ │ │ ├── d
++│ │ │ │ ├── -
++│ │ │ │ │ ├── _
++│ │ │ │ │ │ ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 12 │
++└────────────────────────────────────────────────────┘
++
++├── VeryLong
++│ ├── W
++│ │ ├── o
++│ │ │ ├── r
++│ │ │ │ ├── d → VeryLongWord" triehash --multi-byte=3 -l tree /dev/stdin
++
++
++# Check that building a multi-byte trie works
++testsuccessequal "\
++┌────────────────────────────────────────────────────┐
++│ Initial trie │
++└────────────────────────────────────────────────────┘
++
++├── VeryLong
++│ ├── Word → VeryLongWord
++├── Word → Word
++│ ├── -
++│ │ ├── _
++│ │ │ ├── 0 → Word-_0
++│ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Rebuilt trie │
++└────────────────────────────────────────────────────┘
++
++├── Very
++│ ├── LongWord → VeryLongWord
++├── Word → Word
++│ ├── -
++│ │ ├── _
++│ │ │ ├── 0 → Word-_0
++│ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 4 │
++└────────────────────────────────────────────────────┘
++
++├── Word → Word
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 5 │
++└────────────────────────────────────────────────────┘
++
++├── Word
++│ ├── 2 → Label
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 7 │
++└────────────────────────────────────────────────────┘
++
++├── Word
++│ ├── -
++│ │ ├── _
++│ │ │ ├── 0 → Word-_0
++┌────────────────────────────────────────────────────┐
++│ Trie for words of length 12 │
++└────────────────────────────────────────────────────┘
++
++├── VeryLong
++│ ├── Word → VeryLongWord" triehash -l tree /dev/stdin
++
++
++###### CHANGE THE WORDS FOR THE FOLLOWING TESTS #######
++WORDS="Word"
++
++# Check that we are generating the proper multi-byte and fallback sessions
++testsuccessequal "#include \"/dev/null\"
++#ifdef __GNUC__
++typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
++typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
++typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
++typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
++typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
++typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
++#define onechar(c, s, l) (((uint64_t)(c)) << (s))
++#else
++#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
++#endif
++#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
++#define TRIE_HASH_MULTI_BYTE
++#endif
++#endif /*GNUC */
++#ifdef TRIE_HASH_MULTI_BYTE
++static enum PerfectKey PerfectHash4(const char *string)
++{
++ switch(*((triehash_uu32*) &string[0])) {
++ case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
++ return Word;
++ }
++ return Unknown;
++}
++#else
++static enum PerfectKey PerfectHash4(const char *string)
++{
++ switch(string[0]) {
++ case 'W':
++ switch(string[1]) {
++ case 'o':
++ switch(string[2]) {
++ case 'r':
++ switch(string[3]) {
++ case 'd':
++ return Word;
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++#endif /* TRIE_HASH_MULTI_BYTE */
++ enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ case 4:
++ return PerfectHash4(string);
++ default:
++ return Unknown;
++ }
++}" triehash -H /dev/null /dev/stdin
++
++
++# Check that we are generating no multi-byte session
++testsuccessequal "#include \"/dev/null\"
++static enum PerfectKey PerfectHash4(const char *string)
++{
++ switch(string[0]) {
++ case 'W':
++ switch(string[1]) {
++ case 'o':
++ switch(string[2]) {
++ case 'r':
++ switch(string[3]) {
++ case 'd':
++ return Word;
++ }
++ }
++ }
++ }
++ return Unknown;
++}
++ enum PerfectKey PerfectHash(const char *string, size_t length)
++{
++ switch (length) {
++ case 4:
++ return PerfectHash4(string);
++ default:
++ return Unknown;
++ }
++}" triehash --multi-byte=0 -H /dev/null /dev/stdin
--- /dev/null
--- /dev/null
++#!/usr/bin/perl -w
++#
++# Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
++#
++# Permission is hereby granted, free of charge, to any person obtaining a copy
++# of this software and associated documentation files (the "Software"), to deal
++# in the Software without restriction, including without limitation the rights
++# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++# copies of the Software, and to permit persons to whom the Software is
++# furnished to do so, subject to the following conditions:
++#
++# The above copyright notice and this permission notice shall be included in
++# all copies or substantial portions of the Software.
++#
++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++# THE SOFTWARE.
++
++
++=head1 NAME
++
++triehash - Generate a perfect hash function derived from a trie.
++
++=cut
++
++use strict;
++use warnings;
++use Getopt::Long;
++
++=head1 SYNOPSIS
++
++B<triehash> [S<I<option>>] [S<I<input file>>]
++
++=head1 DESCRIPTION
++
++triehash takes a list of words in input file and generates a function and
++an enumeration to describe the word
++
++=head1 INPUT FILE FORMAT
++
++The file consists of multiple lines of the form:
++
++ [label ~ ] word [= value]
++
++This maps word to value, and generates an enumeration with entries of the form:
++
++ label = value
++
++If I<label> is undefined, the word will be used, the minus character will be
++replaced by an underscore. If value is undefined it is counted upwards from
++the last value.
++
++There may also be one line of the format
++
++ [ label ~] = value
++
++Which defines the value to be used for non-existing keys. Note that this also
++changes default value for other keys, as for normal entries. So if you place
++
++ = 0
++
++at the beginning of the file, unknown strings map to 0, and the other strings
++map to values starting with 1. If label is not specified, the default is
++I<Unknown>.
++
++=head1 OPTIONS
++
++=over 4
++
++=item B<-C>I<.c file> B<--code>=I<.c file>
++
++Generate code in the given file.
++
++=item B<-H>I<header file> B<--header>=I<header file>
++
++Generate a header in the given file, containing a declaration of the hash
++function and an enumeration.
++
++=item B<--enum-name=>I<word>
++
++The name of the enumeration.
++
++=item B<--function-name=>I<word>
++
++The name of the function.
++
++=item B<--namespace=>I<name>
++
++Put the function and enum into a namespace (C++)
++
++=item B<--class=>I<name>
++
++Put the function and enum into a class (C++)
++
++=item B<--enum-class>
++
++Generate an enum class instead of an enum (C++)
++
++=item B<--counter-name=>I<name>
++
++Use I<name> for a counter that is set to the latest entry in the enumeration
+++ 1. This can be useful for defining array sizes.
++
++=item B<--extern-c>
++
++Wrap everything into an extern "C" block. Not compatible with the C++
++options, as a header with namespaces, classes, or enum classes is not
++valid C.
++
++=item B<--multi-byte>=I<value>
++
++Generate code reading multiple bytes at once. The value is a string of power
++of twos to enable. The default value is 320 meaning that 8, 4, and single byte
++reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you
++also want to allow 2-byte reads. 2-byte reads are disabled by default because
++they negatively affect performance on older Intel architectures.
++
++This generates code for both multiple bytes and single byte reads, but only
++enables the multiple byte reads of GNU C compatible compilers, as the following
++extensions are used:
++
++=over 8
++
++=item Byte-aligned integers
++
++We must be able to generate integers that are aligned to a single byte using:
++
++ typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
++
++=item Byte-order
++
++The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.
++
++=back
++
++We forcefully disable multi-byte reads on platforms where the variable
++I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
++as there is a measurable overhead from emulating the unaligned reads on
++ARM.
++
++=item B<--language=>I<language>
++
++Generate a file in the specified language. Currently known are 'C' and 'tree',
++the latter generating a tree.
++
++=item B<--include=>I<header>
++
++Add the header to the include statements of the header file. The value must
++be surrounded by quotes or angle brackets for C code. May be specified multiple
++times.
++
++=back
++
++=cut
++
++my $unknown = -1;
++my $unknown_label = "Unknown";
++my $counter_start = 0;
++my $enum_name = "PerfectKey";
++my $function_name = "PerfectHash";
++my $enum_class = 0;
++
++my $code_name = "-";
++my $header_name = "-";
++my $code;
++my $header;
++my $ignore_case = 0;
++my $multi_byte = "320";
++my $language = 'C';
++my $counter_name = undef;
++my @includes = ();
++
++
++Getopt::Long::config('default',
++ 'bundling',
++ 'no_getopt_compat',
++ 'no_auto_abbrev',
++ 'permute',
++ 'auto_help');
++
++GetOptions ("code|C=s" => \$code_name,
++ "header|H=s" => \$header_name,
++ "function-name=s" => \$function_name,
++ "ignore-case" => \$ignore_case,
++ "enum-name=s" => \$enum_name,
++ "language|l=s" => \$language,
++ "multi-byte=s" => \$multi_byte,
++ "enum-class" => \$enum_class,
++ "include=s" => \@includes,
++ "counter-name=s" => \$counter_name)
++ or die("Could not parse options!");
++
++
++package Trie {
++
++ sub new {
++ my $class = shift;
++ my $self = {};
++ bless $self, $class;
++
++ $self->{children} = {};
++ $self->{value} = undef;
++ $self->{label} = undef;
++
++ return $self;
++ }
++
++ # Return the largest power of 2 smaller or equal to the argument
++ sub alignpower2 {
++ my ($self, $length) = @_;
++
++ return 8 if ($length >= 8 && $multi_byte =~ /3/);
++ return 4 if ($length >= 4 && $multi_byte =~ /2/);
++ return 2 if ($length >= 2 && $multi_byte =~ /1/);
++
++ return 1;
++ }
++
++ # Split the key into a head block and a tail
++ sub split_key {
++ my ($self, $key) = @_;
++ my $length = length $key;
++ my $split = $self->alignpower2($length);
++
++ return (substr($key, 0, $split), substr($key, $split));
++ }
++
++ sub insert {
++ my ($self, $key, $label, $value) = @_;
++
++ if (length($key) == 0) {
++ $self->{label} = $label;
++ $self->{value} = $value;
++ return;
++ }
++
++ my ($child, $tail) = $self->split_key($key);
++
++ $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child}));
++
++ $self->{children}{$child}->insert($tail, $label, $value);
++ }
++
++ sub filter_depth {
++ my ($self, $togo) = @_;
++
++ my $new = Trie->new;
++
++ if ($togo != 0) {
++ my $found = 0;
++ foreach my $key (sort keys %{$self->{children}}) {
++ if ($togo > length($key) || defined $self->{children}{$key}->{value}) {
++ my $child = $self->{children}{$key}->filter_depth($togo - length($key));
++
++ $new->{children}{$key}= $child if defined $child;
++ $found = 1 if defined $child;
++ }
++ }
++ return undef if (!$found);
++ } else {
++ $new->{value} = $self->{value};
++ $new->{label} = $self->{label};
++ }
++
++ return $new;
++ }
++
++ # Reinsert all value nodes into the specified $trie, prepending $prefix
++ # to their $paths.
++ sub reinsert_value_nodes_into {
++ my ($self, $trie, $prefix) = @_;
++
++ $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value});
++
++ foreach my $key (sort keys %{$self->{children}}) {
++ $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
++ }
++ }
++
++ # Find an earlier split due a an ambiguous character
++ sub find_ealier_split {
++ my ($self, $key) = @_;
++
++ if ($ignore_case) {
++ for my $i (0..length($key)-1) {
++ # If the key starts with an ambiguous character, we need to
++ # take only it. Otherwise, we need to take everything
++ # before the character.
++ return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1)));
++ }
++ }
++ return $self->alignpower2(length $key);
++ }
++
++ # Rebuild the trie, splitting at ambigous chars, and unifying key lengths
++ sub rebuild_tree {
++ my $self = shift;
++ # Determine if/where we need to split before an ambiguous character
++ my $new_split = 99999999999999999;
++ foreach my $key (sort keys %{$self->{children}}) {
++ my $special_length = $self->find_ealier_split($key);
++ $new_split = $special_length if ($special_length < $new_split);
++ }
++
++ # Start building a new uniform trie
++ my $newself = Trie->new;
++ $newself->{label} = $self->{label};
++ $newself->{value} = $self->{value};
++ $newself->{children} = {};
++
++ foreach my $key (sort keys %{$self->{children}}) {
++ my $head = substr($key, 0, $new_split);
++ my $tail = substr($key, $new_split);
++ # Rebuild the child node at $head, pushing $tail downwards
++ $newself->{children}{$head} //= Trie->new;
++ $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail);
++ # We took up to one special character of each key label. There might
++ # be more, so we need to rebuild recursively.
++ $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree();
++ }
++
++ return $newself;
++ }
++}
++
++# Code generator for C and C++
++package CCodeGen {
++ my $static = ($code_name eq $header_name) ? "static" : "";
++ my $enum_specifier = $enum_class ? "enum class" : "enum";
++
++ sub new {
++ my $class = shift;
++ my $self = {};
++ bless $self, $class;
++
++ return $self;
++ }
++
++ sub open_output {
++ my $self = shift;
++ if ($code_name ne "-") {
++ open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
++ } else {
++ $code = *STDOUT;
++ }
++ if($code_name eq $header_name) {
++ $header = $code;
++ } elsif ($header_name ne "-") {
++ open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
++ } else {
++ $header = *STDOUT;
++ }
++ }
++
++ sub word_to_label {
++ my ($class, $word) = @_;
++
++ $word =~ s/_/__/g;
++ $word =~ s/-/_/g;
++ return $word;
++ }
++
++ # Return a case label, by shifting and or-ing bytes in the word
++ sub case_label {
++ my ($self, $key) = @_;
++
++ return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;
++
++ my $output = '0';
++
++ for my $i (0..length($key)-1) {
++ $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
++ }
++
++ return $output;
++ }
++
++ # Return an appropriate read instruction for $length bytes from $offset
++ sub switch_key {
++ my ($self, $offset, $length) = @_;
++
++ return "string[$offset]" if $length == 1;
++ return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
++ }
++
++ sub print_table {
++ my ($self, $trie, $fh, $indent, $index) = @_;
++ $indent //= 0;
++ $index //= 0;
++
++ if (defined $trie->{value}) {
++ printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
++ return;
++ }
++
++ # The difference between lowercase and uppercase alphabetical characters
++ # is that they have one bit flipped. If we have alphabetical characters
++ # in the search space, and the entire search space works fine if we
++ # always turn on the flip, just OR the character we are switching over
++ # with the bit.
++ my $want_use_bit = 0;
++ my $can_use_bit = 1;
++ my $key_length = 0;
++ foreach my $key (sort keys %{$trie->{children}}) {
++ $can_use_bit &= not main::ambiguous($key);
++ $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
++ $key_length = length($key);
++ }
++
++ if ($ignore_case && $can_use_bit && $want_use_bit) {
++ printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
++ } else {
++ printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
++ }
++
++ my $notfirst = 0;
++ foreach my $key (sort keys %{$trie->{children}}) {
++ if ($notfirst) {
++ printf $fh (" " x $indent . " break;\n");
++ }
++ if ($ignore_case) {
++ printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key)));
++ printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
++ } else {
++ printf $fh (" " x $indent . "case %s:\n", $self->case_label($key));
++ }
++
++ $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
++
++ $notfirst=1;
++ }
++
++ printf $fh (" " x $indent . "}\n");
++ }
++
++ sub print_words {
++ my ($self, $trie, $fh, $indent, $sofar) = @_;
++
++ $indent //= 0;
++ $sofar //= "";
++
++
++ printf $fh (" " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value};
++
++ foreach my $key (sort keys %{$trie->{children}}) {
++ $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key);
++ }
++ }
++
++ sub print_functions {
++ my ($self, $trie, %lengths) = @_;
++ foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
++ print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
++ print $code ("{\n");
++ $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
++ printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
++ print $code ("}\n");
++ }
++ }
++
++ sub main {
++ my ($self, $trie, $num_values, %lengths) = @_;
++ print $header ("#ifndef TRIE_HASH_${function_name}\n");
++ print $header ("#define TRIE_HASH_${function_name}\n");
++ print $header ("#include <stddef.h>\n");
++ print $header ("#include <stdint.h>\n");
++ foreach my $include (@includes) {
++ print $header ("#include $include\n");
++ }
++ printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
++ print $header ("${enum_specifier} ${enum_name} {\n");
++ $self->print_words($trie, $header, 1);
++ printf $header (" $unknown_label = $unknown,\n");
++ print $header ("};\n");
++ print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");
++
++ print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);
++
++ if ($multi_byte) {
++ print $code ("#ifdef __GNUC__\n");
++ for (my $i=16; $i <= 64; $i *= 2) {
++ print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
++ print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
++ }
++
++ print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
++ print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
++ print $code ("#else\n");
++ print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
++ print $code ("#endif\n");
++ print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
++ print $code ("#define TRIE_HASH_MULTI_BYTE\n");
++ print $code ("#endif\n");
++ print $code ("#endif /*GNUC */\n");
++
++ print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
++ $self->print_functions($trie, %lengths);
++ $multi_byte = 0;
++ print $code ("#else\n");
++ $self->print_functions($trie, %lengths);
++ print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
++ } else {
++ $self->print_functions($trie, %lengths);
++ }
++
++ print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
++ print $code ("{\n");
++ print $code (" switch (length) {\n");
++ foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
++ print $code (" case $local_length:\n");
++ print $code (" return ${function_name}${local_length}(string);\n");
++ }
++ print $code (" default:\n");
++ printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
++ print $code (" }\n");
++ print $code ("}\n");
++
++ # Print end of header here, in case header and code point to the same file
++ print $header ("#endif /* TRIE_HASH_${function_name} */\n");
++ }
++}
++
++# Check if the word can be reached by exactly one word in (alphabet OR 0x20).
++sub ambiguous {
++ my $word = shift;
++
++ foreach my $char (split //, $word) {
++ # Setting the lowercase flag in the character produces a different
++ # character, the character would thus not be matched.
++ return 1 if ((ord($char) | 0x20) != ord(lc($char)));
++
++ # A word is also ambiguous if any character in lowercase can be reached
++ # by ORing 0x20 from another character in the charset that is not a
++ # lowercase character of the current character.
++ # Assume that we have UTF-8 and the most significant bit can be set
++ for my $i (0..255) {
++ return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char));
++ }
++ }
++
++ return 0;
++}
++
++sub build_trie {
++ my $codegen = shift;
++ my $trie = Trie->new;
++
++ my $counter = $counter_start;
++ my %lengths;
++
++ open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
++ while (my $line = <$input>) {
++ my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/;
++
++ if (defined $word) {
++ $counter = $value if defined($value);
++ $label //= $codegen->word_to_label($word);
++
++ $trie->insert($word, $label, $counter);
++ $lengths{length($word)} = 1;
++ $counter++;
++ } elsif (defined $value) {
++ $unknown = $value;
++ $unknown_label = $label if defined($label);
++ $counter = $value + 1;
++ } else {
++ die "Invalid line: $line";
++ }
++ }
++
++ return ($trie, $counter, %lengths);
++}
++
++# Generates an ASCII art tree
++package TreeCodeGen {
++
++ sub new {
++ my $class = shift;
++ my $self = {};
++ bless $self, $class;
++
++ return $self;
++ }
++
++ sub word_to_label {
++ my ($self, $word) = @_;
++ return $word;
++ }
++
++ sub main {
++ my ($self, $trie, $counter, %lengths) = @_;
++ printf $code ("┌────────────────────────────────────────────────────┐\n");
++ printf $code ("│ Initial trie │\n");
++ printf $code ("└────────────────────────────────────────────────────┘\n");
++ $self->print($trie);
++ printf $code ("┌────────────────────────────────────────────────────┐\n");
++ printf $code ("│ Rebuilt trie │\n");
++ printf $code ("└────────────────────────────────────────────────────┘\n");
++ $self->print($trie->rebuild_tree());
++
++ foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
++ printf $code ("┌────────────────────────────────────────────────────┐\n");
++ printf $code ("│ Trie for words of length %-4d │\n", $local_length);
++ printf $code ("└────────────────────────────────────────────────────┘\n");
++ $self->print($trie->filter_depth($local_length)->rebuild_tree());
++ }
++ }
++
++ sub open_output {
++ my $self = shift;
++ if ($code_name ne "-") {
++ open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
++ } else {
++ $code = *STDOUT;
++ }
++ }
++
++ # Print a trie
++ sub print {
++ my ($self, $trie, $depth) = @_;
++ $depth //= 0;
++
++ print $code (" → ") if defined($trie->{label});
++ print $code ($trie->{label} // "", "\n");
++ foreach my $key (sort keys %{$trie->{children}}) {
++ print $code ("│ " x ($depth), "├── $key");
++ $self->print($trie->{children}{$key}, $depth + 1);
++ }
++ }
++}
++
++my %codegens = (
++ C => "CCodeGen",
++ tree => "TreeCodeGen",
++);
++
++
++defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
++my $codegen = $codegens{$language}->new();
++my ($trie, $counter, %lengths) = build_trie($codegen);
++
++$codegen->open_output();
++$codegen->main($trie, $counter, %lengths);
++
++
++=head1 LICENSE
++
++triehash is available under the MIT/Expat license, see the source code
++for more information.
++
++=head1 AUTHOR
++
++Julian Andres Klode <jak@jak-linux.org>
++
++=cut
++