]>
Commit | Line | Data |
---|---|---|
e2073b02 JAK |
1 | #!/usr/bin/perl -w |
2 | # | |
3 | # Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org> | |
4 | # | |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy | |
6 | # of this software and associated documentation files (the "Software"), to deal | |
7 | # in the Software without restriction, including without limitation the rights | |
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
9 | # copies of the Software, and to permit persons to whom the Software is | |
10 | # furnished to do so, subject to the following conditions: | |
11 | # | |
12 | # The above copyright notice and this permission notice shall be included in | |
13 | # all copies or substantial portions of the Software. | |
14 | # | |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
21 | # THE SOFTWARE. | |
22 | ||
23 | ||
24 | =head1 NAME | |
25 | ||
26 | triehash - Generate a perfect hash function derived from a trie. | |
27 | ||
28 | =cut | |
29 | ||
30 | use strict; | |
31 | use warnings; | |
32 | use Getopt::Long; | |
33 | ||
34 | =head1 SYNOPSIS | |
35 | ||
36 | B<triehash> [S<I<option>>] [S<I<input file>>] | |
37 | ||
38 | =head1 DESCRIPTION | |
39 | ||
40 | triehash takes a list of words in input file and generates a function and | |
41 | an enumeration to describe the word | |
42 | ||
43 | =head1 INPUT FILE FORMAT | |
44 | ||
45 | The file consists of multiple lines of the form: | |
46 | ||
47 | [label ~ ] word [= value] | |
48 | ||
49 | This maps word to value, and generates an enumeration with entries of the form: | |
50 | ||
51 | label = value | |
52 | ||
53 | If I<label> is undefined, the word will be used, the minus character will be | |
54 | replaced by an underscore. If value is undefined it is counted upwards from | |
55 | the last value. | |
56 | ||
57 | There may also be one line of the format | |
58 | ||
59 | [ label ~] = value | |
60 | ||
61 | Which defines the value to be used for non-existing keys. Note that this also | |
62 | changes default value for other keys, as for normal entries. So if you place | |
63 | ||
64 | = 0 | |
65 | ||
66 | at the beginning of the file, unknown strings map to 0, and the other strings | |
67 | map to values starting with 1. If label is not specified, the default is | |
68 | I<Unknown>. | |
69 | ||
70 | =head1 OPTIONS | |
71 | ||
72 | =over 4 | |
73 | ||
74 | =item B<-C>I<.c file> B<--code>=I<.c file> | |
75 | ||
76 | Generate code in the given file. | |
77 | ||
78 | =item B<-H>I<header file> B<--header>=I<header file> | |
79 | ||
80 | Generate a header in the given file, containing a declaration of the hash | |
81 | function and an enumeration. | |
82 | ||
83 | =item B<--enum-name=>I<word> | |
84 | ||
85 | The name of the enumeration. | |
86 | ||
87 | =item B<--function-name=>I<word> | |
88 | ||
89 | The name of the function. | |
90 | ||
91 | =item B<--namespace=>I<name> | |
92 | ||
93 | Put the function and enum into a namespace (C++) | |
94 | ||
95 | =item B<--class=>I<name> | |
96 | ||
97 | Put the function and enum into a class (C++) | |
98 | ||
99 | =item B<--enum-class> | |
100 | ||
101 | Generate an enum class instead of an enum (C++) | |
102 | ||
103 | =item B<--counter-name=>I<name> | |
104 | ||
105 | Use I<name> for a counter that is set to the latest entry in the enumeration | |
106 | + 1. This can be useful for defining array sizes. | |
107 | ||
108 | =item B<--extern-c> | |
109 | ||
110 | Wrap everything into an extern "C" block. Not compatible with the C++ | |
111 | options, as a header with namespaces, classes, or enum classes is not | |
112 | valid C. | |
113 | ||
114 | =item B<--multi-byte>=I<value> | |
115 | ||
116 | Generate code reading multiple bytes at once. The value is a string of power | |
117 | of twos to enable. The default value is 320 meaning that 8, 4, and single byte | |
118 | reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you | |
119 | also want to allow 2-byte reads. 2-byte reads are disabled by default because | |
120 | they negatively affect performance on older Intel architectures. | |
121 | ||
122 | This generates code for both multiple bytes and single byte reads, but only | |
123 | enables the multiple byte reads of GNU C compatible compilers, as the following | |
124 | extensions are used: | |
125 | ||
126 | =over 8 | |
127 | ||
128 | =item Byte-aligned integers | |
129 | ||
130 | We must be able to generate integers that are aligned to a single byte using: | |
131 | ||
132 | typedef uint64_t __attribute__((aligned (1))) triehash_uu64; | |
133 | ||
134 | =item Byte-order | |
135 | ||
136 | The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined. | |
137 | ||
138 | =back | |
139 | ||
140 | We forcefully disable multi-byte reads on platforms where the variable | |
141 | I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined, | |
142 | as there is a measurable overhead from emulating the unaligned reads on | |
143 | ARM. | |
144 | ||
145 | =item B<--language=>I<language> | |
146 | ||
147 | Generate a file in the specified language. Currently known are 'C' and 'tree', | |
148 | the latter generating a tree. | |
149 | ||
150 | =item B<--include=>I<header> | |
151 | ||
152 | Add the header to the include statements of the header file. The value must | |
153 | be surrounded by quotes or angle brackets for C code. May be specified multiple | |
154 | times. | |
155 | ||
156 | =back | |
157 | ||
158 | =cut | |
159 | ||
160 | my $unknown = -1; | |
161 | my $unknown_label = "Unknown"; | |
162 | my $counter_start = 0; | |
163 | my $enum_name = "PerfectKey"; | |
164 | my $function_name = "PerfectHash"; | |
165 | my $enum_class = 0; | |
166 | ||
167 | my $code_name = "-"; | |
168 | my $header_name = "-"; | |
169 | my $code; | |
170 | my $header; | |
171 | my $ignore_case = 0; | |
172 | my $multi_byte = "320"; | |
173 | my $language = 'C'; | |
174 | my $counter_name = undef; | |
175 | my @includes = (); | |
176 | ||
177 | ||
178 | Getopt::Long::config('default', | |
179 | 'bundling', | |
180 | 'no_getopt_compat', | |
181 | 'no_auto_abbrev', | |
182 | 'permute', | |
183 | 'auto_help'); | |
184 | ||
185 | GetOptions ("code|C=s" => \$code_name, | |
186 | "header|H=s" => \$header_name, | |
187 | "function-name=s" => \$function_name, | |
188 | "ignore-case" => \$ignore_case, | |
189 | "enum-name=s" => \$enum_name, | |
190 | "language|l=s" => \$language, | |
191 | "multi-byte=s" => \$multi_byte, | |
192 | "enum-class" => \$enum_class, | |
193 | "include=s" => \@includes, | |
194 | "counter-name=s" => \$counter_name) | |
195 | or die("Could not parse options!"); | |
196 | ||
197 | ||
198 | package Trie { | |
199 | ||
200 | sub new { | |
201 | my $class = shift; | |
202 | my $self = {}; | |
203 | bless $self, $class; | |
204 | ||
205 | $self->{children} = {}; | |
206 | $self->{value} = undef; | |
207 | $self->{label} = undef; | |
208 | ||
209 | return $self; | |
210 | } | |
211 | ||
212 | # Return the largest power of 2 smaller or equal to the argument | |
213 | sub alignpower2 { | |
214 | my ($self, $length) = @_; | |
215 | ||
216 | return 8 if ($length >= 8 && $multi_byte =~ /3/); | |
217 | return 4 if ($length >= 4 && $multi_byte =~ /2/); | |
218 | return 2 if ($length >= 2 && $multi_byte =~ /1/); | |
219 | ||
220 | return 1; | |
221 | } | |
222 | ||
223 | # Split the key into a head block and a tail | |
224 | sub split_key { | |
225 | my ($self, $key) = @_; | |
226 | my $length = length $key; | |
227 | my $split = $self->alignpower2($length); | |
228 | ||
229 | return (substr($key, 0, $split), substr($key, $split)); | |
230 | } | |
231 | ||
232 | sub insert { | |
233 | my ($self, $key, $label, $value) = @_; | |
234 | ||
235 | if (length($key) == 0) { | |
236 | $self->{label} = $label; | |
237 | $self->{value} = $value; | |
238 | return; | |
239 | } | |
240 | ||
241 | my ($child, $tail) = $self->split_key($key); | |
242 | ||
243 | $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child})); | |
244 | ||
245 | $self->{children}{$child}->insert($tail, $label, $value); | |
246 | } | |
247 | ||
248 | sub filter_depth { | |
249 | my ($self, $togo) = @_; | |
250 | ||
251 | my $new = Trie->new; | |
252 | ||
253 | if ($togo != 0) { | |
254 | my $found = 0; | |
255 | foreach my $key (sort keys %{$self->{children}}) { | |
256 | if ($togo > length($key) || defined $self->{children}{$key}->{value}) { | |
257 | my $child = $self->{children}{$key}->filter_depth($togo - length($key)); | |
258 | ||
259 | $new->{children}{$key}= $child if defined $child; | |
260 | $found = 1 if defined $child; | |
261 | } | |
262 | } | |
263 | return undef if (!$found); | |
264 | } else { | |
265 | $new->{value} = $self->{value}; | |
266 | $new->{label} = $self->{label}; | |
267 | } | |
268 | ||
269 | return $new; | |
270 | } | |
271 | ||
272 | # Reinsert all value nodes into the specified $trie, prepending $prefix | |
273 | # to their $paths. | |
274 | sub reinsert_value_nodes_into { | |
275 | my ($self, $trie, $prefix) = @_; | |
276 | ||
277 | $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value}); | |
278 | ||
279 | foreach my $key (sort keys %{$self->{children}}) { | |
280 | $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key); | |
281 | } | |
282 | } | |
283 | ||
284 | # Find an earlier split due a an ambiguous character | |
285 | sub find_ealier_split { | |
286 | my ($self, $key) = @_; | |
287 | ||
288 | if ($ignore_case) { | |
289 | for my $i (0..length($key)-1) { | |
290 | # If the key starts with an ambiguous character, we need to | |
291 | # take only it. Otherwise, we need to take everything | |
292 | # before the character. | |
293 | return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1))); | |
294 | } | |
295 | } | |
296 | return $self->alignpower2(length $key); | |
297 | } | |
298 | ||
299 | # Rebuild the trie, splitting at ambigous chars, and unifying key lengths | |
300 | sub rebuild_tree { | |
301 | my $self = shift; | |
302 | # Determine if/where we need to split before an ambiguous character | |
303 | my $new_split = 99999999999999999; | |
304 | foreach my $key (sort keys %{$self->{children}}) { | |
305 | my $special_length = $self->find_ealier_split($key); | |
306 | $new_split = $special_length if ($special_length < $new_split); | |
307 | } | |
308 | ||
309 | # Start building a new uniform trie | |
310 | my $newself = Trie->new; | |
311 | $newself->{label} = $self->{label}; | |
312 | $newself->{value} = $self->{value}; | |
313 | $newself->{children} = {}; | |
314 | ||
315 | foreach my $key (sort keys %{$self->{children}}) { | |
316 | my $head = substr($key, 0, $new_split); | |
317 | my $tail = substr($key, $new_split); | |
318 | # Rebuild the child node at $head, pushing $tail downwards | |
319 | $newself->{children}{$head} //= Trie->new; | |
320 | $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail); | |
321 | # We took up to one special character of each key label. There might | |
322 | # be more, so we need to rebuild recursively. | |
323 | $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree(); | |
324 | } | |
325 | ||
326 | return $newself; | |
327 | } | |
328 | } | |
329 | ||
330 | # Code generator for C and C++ | |
331 | package CCodeGen { | |
332 | my $static = ($code_name eq $header_name) ? "static" : ""; | |
333 | my $enum_specifier = $enum_class ? "enum class" : "enum"; | |
334 | ||
335 | sub new { | |
336 | my $class = shift; | |
337 | my $self = {}; | |
338 | bless $self, $class; | |
339 | ||
340 | return $self; | |
341 | } | |
342 | ||
343 | sub open_output { | |
344 | my $self = shift; | |
345 | if ($code_name ne "-") { | |
346 | open($code, '>', $code_name) or die "Cannot open $code_name: $!" ; | |
347 | } else { | |
348 | $code = *STDOUT; | |
349 | } | |
350 | if($code_name eq $header_name) { | |
351 | $header = $code; | |
352 | } elsif ($header_name ne "-") { | |
353 | open($header, '>', $header_name) or die "Cannot open $header_name: $!" ; | |
354 | } else { | |
355 | $header = *STDOUT; | |
356 | } | |
357 | } | |
358 | ||
359 | sub word_to_label { | |
360 | my ($class, $word) = @_; | |
361 | ||
362 | $word =~ s/_/__/g; | |
363 | $word =~ s/-/_/g; | |
364 | return $word; | |
365 | } | |
366 | ||
367 | # Return a case label, by shifting and or-ing bytes in the word | |
368 | sub case_label { | |
369 | my ($self, $key) = @_; | |
370 | ||
371 | return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte; | |
372 | ||
373 | my $output = '0'; | |
374 | ||
375 | for my $i (0..length($key)-1) { | |
376 | $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key)); | |
377 | } | |
378 | ||
379 | return $output; | |
380 | } | |
381 | ||
382 | # Return an appropriate read instruction for $length bytes from $offset | |
383 | sub switch_key { | |
384 | my ($self, $offset, $length) = @_; | |
385 | ||
386 | return "string[$offset]" if $length == 1; | |
387 | return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8); | |
388 | } | |
389 | ||
390 | sub print_table { | |
391 | my ($self, $trie, $fh, $indent, $index) = @_; | |
392 | $indent //= 0; | |
393 | $index //= 0; | |
394 | ||
395 | if (defined $trie->{value}) { | |
396 | printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label}); | |
397 | return; | |
398 | } | |
399 | ||
400 | # The difference between lowercase and uppercase alphabetical characters | |
401 | # is that they have one bit flipped. If we have alphabetical characters | |
402 | # in the search space, and the entire search space works fine if we | |
403 | # always turn on the flip, just OR the character we are switching over | |
404 | # with the bit. | |
405 | my $want_use_bit = 0; | |
406 | my $can_use_bit = 1; | |
407 | my $key_length = 0; | |
408 | foreach my $key (sort keys %{$trie->{children}}) { | |
409 | $can_use_bit &= not main::ambiguous($key); | |
410 | $want_use_bit |= ($key =~ /^[a-zA-Z]+$/); | |
411 | $key_length = length($key); | |
412 | } | |
413 | ||
414 | if ($ignore_case && $can_use_bit && $want_use_bit) { | |
415 | printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length); | |
416 | } else { | |
417 | printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length)); | |
418 | } | |
419 | ||
420 | my $notfirst = 0; | |
421 | foreach my $key (sort keys %{$trie->{children}}) { | |
422 | if ($notfirst) { | |
423 | printf $fh (" " x $indent . " break;\n"); | |
424 | } | |
425 | if ($ignore_case) { | |
426 | printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key))); | |
427 | printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit); | |
428 | } else { | |
429 | printf $fh (" " x $indent . "case %s:\n", $self->case_label($key)); | |
430 | } | |
431 | ||
432 | $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key)); | |
433 | ||
434 | $notfirst=1; | |
435 | } | |
436 | ||
437 | printf $fh (" " x $indent . "}\n"); | |
438 | } | |
439 | ||
440 | sub print_words { | |
441 | my ($self, $trie, $fh, $indent, $sofar) = @_; | |
442 | ||
443 | $indent //= 0; | |
444 | $sofar //= ""; | |
445 | ||
446 | ||
447 | printf $fh (" " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value}; | |
448 | ||
449 | foreach my $key (sort keys %{$trie->{children}}) { | |
450 | $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key); | |
451 | } | |
452 | } | |
453 | ||
454 | sub print_functions { | |
455 | my ($self, $trie, %lengths) = @_; | |
456 | foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { | |
457 | print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n"); | |
458 | print $code ("{\n"); | |
459 | $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1); | |
460 | printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : "")); | |
461 | print $code ("}\n"); | |
462 | } | |
463 | } | |
464 | ||
465 | sub main { | |
466 | my ($self, $trie, $num_values, %lengths) = @_; | |
467 | print $header ("#ifndef TRIE_HASH_${function_name}\n"); | |
468 | print $header ("#define TRIE_HASH_${function_name}\n"); | |
469 | print $header ("#include <stddef.h>\n"); | |
470 | print $header ("#include <stdint.h>\n"); | |
471 | foreach my $include (@includes) { | |
472 | print $header ("#include $include\n"); | |
473 | } | |
474 | printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name)); | |
475 | print $header ("${enum_specifier} ${enum_name} {\n"); | |
476 | $self->print_words($trie, $header, 1); | |
477 | printf $header (" $unknown_label = $unknown,\n"); | |
478 | print $header ("};\n"); | |
479 | print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n"); | |
480 | ||
481 | print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name); | |
482 | ||
483 | if ($multi_byte) { | |
484 | print $code ("#ifdef __GNUC__\n"); | |
485 | for (my $i=16; $i <= 64; $i *= 2) { | |
486 | print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n"); | |
487 | print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n"); | |
488 | } | |
489 | ||
490 | print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n"); | |
491 | print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n"); | |
492 | print $code ("#else\n"); | |
493 | print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n"); | |
494 | print $code ("#endif\n"); | |
495 | print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n"); | |
496 | print $code ("#define TRIE_HASH_MULTI_BYTE\n"); | |
497 | print $code ("#endif\n"); | |
498 | print $code ("#endif /*GNUC */\n"); | |
499 | ||
500 | print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n"); | |
501 | $self->print_functions($trie, %lengths); | |
502 | $multi_byte = 0; | |
503 | print $code ("#else\n"); | |
504 | $self->print_functions($trie, %lengths); | |
505 | print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n"); | |
506 | } else { | |
507 | $self->print_functions($trie, %lengths); | |
508 | } | |
509 | ||
510 | print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n"); | |
511 | print $code ("{\n"); | |
512 | print $code (" switch (length) {\n"); | |
513 | foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { | |
514 | print $code (" case $local_length:\n"); | |
515 | print $code (" return ${function_name}${local_length}(string);\n"); | |
516 | } | |
517 | print $code (" default:\n"); | |
518 | printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : "")); | |
519 | print $code (" }\n"); | |
520 | print $code ("}\n"); | |
521 | ||
522 | # Print end of header here, in case header and code point to the same file | |
523 | print $header ("#endif /* TRIE_HASH_${function_name} */\n"); | |
524 | } | |
525 | } | |
526 | ||
527 | # Check if the word can be reached by exactly one word in (alphabet OR 0x20). | |
528 | sub ambiguous { | |
529 | my $word = shift; | |
530 | ||
531 | foreach my $char (split //, $word) { | |
532 | # Setting the lowercase flag in the character produces a different | |
533 | # character, the character would thus not be matched. | |
534 | return 1 if ((ord($char) | 0x20) != ord(lc($char))); | |
535 | ||
536 | # A word is also ambiguous if any character in lowercase can be reached | |
537 | # by ORing 0x20 from another character in the charset that is not a | |
538 | # lowercase character of the current character. | |
539 | # Assume that we have UTF-8 and the most significant bit can be set | |
540 | for my $i (0..255) { | |
541 | return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char)); | |
542 | } | |
543 | } | |
544 | ||
545 | return 0; | |
546 | } | |
547 | ||
548 | sub build_trie { | |
549 | my $codegen = shift; | |
550 | my $trie = Trie->new; | |
551 | ||
552 | my $counter = $counter_start; | |
553 | my %lengths; | |
554 | ||
555 | open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!"; | |
556 | while (my $line = <$input>) { | |
557 | my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/; | |
558 | ||
559 | if (defined $word) { | |
560 | $counter = $value if defined($value); | |
561 | $label //= $codegen->word_to_label($word); | |
562 | ||
563 | $trie->insert($word, $label, $counter); | |
564 | $lengths{length($word)} = 1; | |
565 | $counter++; | |
566 | } elsif (defined $value) { | |
567 | $unknown = $value; | |
568 | $unknown_label = $label if defined($label); | |
569 | $counter = $value + 1; | |
570 | } else { | |
571 | die "Invalid line: $line"; | |
572 | } | |
573 | } | |
574 | ||
575 | return ($trie, $counter, %lengths); | |
576 | } | |
577 | ||
578 | # Generates an ASCII art tree | |
579 | package TreeCodeGen { | |
580 | ||
581 | sub new { | |
582 | my $class = shift; | |
583 | my $self = {}; | |
584 | bless $self, $class; | |
585 | ||
586 | return $self; | |
587 | } | |
588 | ||
589 | sub word_to_label { | |
590 | my ($self, $word) = @_; | |
591 | return $word; | |
592 | } | |
593 | ||
594 | sub main { | |
595 | my ($self, $trie, $counter, %lengths) = @_; | |
596 | printf $code ("┌────────────────────────────────────────────────────┐\n"); | |
597 | printf $code ("│ Initial trie │\n"); | |
598 | printf $code ("└────────────────────────────────────────────────────┘\n"); | |
599 | $self->print($trie); | |
600 | printf $code ("┌────────────────────────────────────────────────────┐\n"); | |
601 | printf $code ("│ Rebuilt trie │\n"); | |
602 | printf $code ("└────────────────────────────────────────────────────┘\n"); | |
603 | $self->print($trie->rebuild_tree()); | |
604 | ||
605 | foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { | |
606 | printf $code ("┌────────────────────────────────────────────────────┐\n"); | |
607 | printf $code ("│ Trie for words of length %-4d │\n", $local_length); | |
608 | printf $code ("└────────────────────────────────────────────────────┘\n"); | |
609 | $self->print($trie->filter_depth($local_length)->rebuild_tree()); | |
610 | } | |
611 | } | |
612 | ||
613 | sub open_output { | |
614 | my $self = shift; | |
615 | if ($code_name ne "-") { | |
616 | open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ; | |
617 | } else { | |
618 | $code = *STDOUT; | |
619 | } | |
620 | } | |
621 | ||
622 | # Print a trie | |
623 | sub print { | |
624 | my ($self, $trie, $depth) = @_; | |
625 | $depth //= 0; | |
626 | ||
627 | print $code (" → ") if defined($trie->{label}); | |
628 | print $code ($trie->{label} // "", "\n"); | |
629 | foreach my $key (sort keys %{$trie->{children}}) { | |
630 | print $code ("│ " x ($depth), "├── $key"); | |
631 | $self->print($trie->{children}{$key}, $depth + 1); | |
632 | } | |
633 | } | |
634 | } | |
635 | ||
636 | my %codegens = ( | |
637 | C => "CCodeGen", | |
638 | tree => "TreeCodeGen", | |
639 | ); | |
640 | ||
641 | ||
642 | defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens); | |
643 | my $codegen = $codegens{$language}->new(); | |
644 | my ($trie, $counter, %lengths) = build_trie($codegen); | |
645 | ||
646 | $codegen->open_output(); | |
647 | $codegen->main($trie, $counter, %lengths); | |
648 | ||
649 | ||
650 | =head1 LICENSE | |
651 | ||
652 | triehash is available under the MIT/Expat license, see the source code | |
653 | for more information. | |
654 | ||
655 | =head1 AUTHOR | |
656 | ||
657 | Julian Andres Klode <jak@jak-linux.org> | |
658 | ||
659 | =cut | |
660 |