2 ***********************************************************************
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
5 ***********************************************************************
6 ***********************************************************************
7 * Copyright (C) 2010-2014, International Business Machines
8 * Corporation and others. All Rights Reserved.
9 ***********************************************************************
10 * file name: dicttrieperf.cpp
12 * tab size: 8 (not used)
15 * created on: 2010dec09
16 * created by: Markus W. Scherer
18 * Performance test program for dictionary-type tries.
20 * Usage from within <ICU build tree>/test/perf/dicttrieperf/ :
23 * export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw
24 * ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000
26 * ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250
31 #include "unicode/bytestrie.h"
32 #include "unicode/bytestriebuilder.h"
33 #include "unicode/localpointer.h"
34 #include "unicode/ucharstrie.h"
35 #include "unicode/ucharstriebuilder.h"
36 #include "unicode/uperf.h"
37 #include "unicode/utext.h"
41 #include "ucbuf.h" // struct ULine
44 #include "cmemory.h" // for UPRV_LENGTHOF
47 class DictionaryTriePerfTest
: public UPerfTest
{
49 DictionaryTriePerfTest(int32_t argc
, const char *argv
[], UErrorCode
&status
)
50 : UPerfTest(argc
, argv
, NULL
, 0, "", status
), numTextLines(0) {
53 for(int32_t i
=0; i
<numLines
; ++i
) {
54 // Skip comment lines (start with a character below 'A').
55 if(lines
[i
].name
[0]>=0x41) {
57 // Remove trailing CR LF.
58 int32_t len
=lines
[i
].len
;
60 while(len
>0 && ((c
=lines
[i
].name
[len
-1])==0xa || c
==0xd)) {
69 virtual UPerfFunction
*runIndexedTest(int32_t index
, UBool exec
, const char *&name
, char *par
=NULL
);
71 const char *getSourceDir() const { return sourceDir
; }
73 UBool
hasFile() const { return ucharBuf
!=NULL
; }
74 const ULine
*getCachedLines() const { return lines
; }
75 int32_t getNumLines() const { return numLines
; }
76 int32_t numTextLines
; // excluding comment lines
79 // Performance test function object.
80 // Loads icudt46l.dat (or whatever its current versioned filename)
81 // from the -s or --sourcedir path.
82 class PackageLookup
: public UPerfFunction
{
84 PackageLookup(const DictionaryTriePerfTest
&perf
) {
85 IcuToolErrorCode
errorCode("PackageLookup()");
86 CharString
filename(perf
.getSourceDir(), errorCode
);
87 int32_t filenameLength
=filename
.length();
88 if(filenameLength
>0 && filename
[filenameLength
-1]!=U_FILE_SEP_CHAR
&&
89 filename
[filenameLength
-1]!=U_FILE_ALT_SEP_CHAR
) {
90 filename
.append(U_FILE_SEP_CHAR
, errorCode
);
92 filename
.append(U_ICUDATA_NAME
, errorCode
);
93 filename
.append(".dat", errorCode
);
94 pkg
.readPackage(filename
.data());
98 virtual ~PackageLookup() {}
100 // virtual void call(UErrorCode* pErrorCode) { ... }
102 virtual long getOperationsPerIteration() {
103 return pkg
.getItemCount();
106 // virtual long getEventsPerIteration();
113 int32_t nameOffset
, dataOffset
;
116 // Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c).
117 static int32_t simpleBinarySearch(const char *s
, const char *names
, const TOCEntry
*toc
, int32_t count
) {
120 int32_t lastNumber
=limit
;
122 int32_t number
=(start
+limit
)/2;
123 if(lastNumber
==number
) { // have we moved?
124 return -1; // not found
127 int32_t cmp
=strcmp(s
, names
+toc
[number
].nameOffset
);
138 class BinarySearchPackageLookup
: public PackageLookup
{
140 BinarySearchPackageLookup(const DictionaryTriePerfTest
&perf
)
141 : PackageLookup(perf
) {
142 IcuToolErrorCode
errorCode("BinarySearchPackageLookup()");
143 int32_t count
=pkg
.getItemCount();
144 toc
=new TOCEntry
[count
];
145 for(int32_t i
=0; i
<count
; ++i
) {
146 toc
[i
].nameOffset
=itemNames
.length();
147 toc
[i
].dataOffset
=i
; // arbitrary value, see toc comment below
148 // The Package class removes the "icudt46l/" prefix.
149 // We restore that here for a fair performance test.
150 const char *name
=pkg
.getItem(i
)->name
;
151 itemNames
.append("icudt46l/", errorCode
);
152 itemNames
.append(name
, strlen(name
)+1, errorCode
);
154 printf("size of item names: %6ld\n", (long)itemNames
.length());
155 printf("size of TOC: %6ld\n", (long)(count
*8));
156 printf("total index size: %6ld\n", (long)(itemNames
.length()+count
*8));
158 virtual ~BinarySearchPackageLookup() {
162 virtual void call(UErrorCode
* /*pErrorCode*/) {
163 int32_t count
=pkg
.getItemCount();
164 const char *itemNameChars
=itemNames
.data();
165 const char *name
=itemNameChars
;
166 for(int32_t i
=0; i
<count
; ++i
) {
167 if(simpleBinarySearch(name
, itemNameChars
, toc
, count
)<0) {
168 fprintf(stderr
, "item not found: %s\n", name
);
170 name
=strchr(name
, 0)+1;
175 CharString itemNames
;
176 // toc imitates a .dat file's array of UDataOffsetTOCEntry
177 // with nameOffset and dataOffset.
178 // We don't need the dataOffsets, but we want to imitate the real
179 // memory density, to measure equivalent CPU cache usage.
184 #define MIN(a,b) (((a)<(b)) ? (a) : (b))
187 // Compare strings where we know the shared prefix length,
188 // and advance the prefix length as we find that the strings share even more characters.
189 static int32_t strcmpAfterPrefix(const char *s1
, const char *s2
, int32_t &prefixLength
) {
190 int32_t pl
=prefixLength
;
195 int32_t c1
=(uint8_t)*s1
++;
196 int32_t c2
=(uint8_t)*s2
++;
198 if(cmp
!=0 || c1
==0) { // different or done
201 ++pl
; // increment shared same-prefix length
207 static int32_t prefixBinarySearch(const char *s
, const char *names
, const TOCEntry
*toc
, int32_t count
) {
213 // Remember the shared prefix between s, start and limit,
214 // and don't compare that shared prefix again.
215 // The shared prefix should get longer as we narrow the [start, limit[ range.
216 int32_t startPrefixLength
=0;
217 int32_t limitPrefixLength
=0;
218 // Prime the prefix lengths so that we don't keep prefixLength at 0 until
219 // both the start and limit indexes have moved.
220 // At the same time, we find if s is one of the start and (limit-1) names,
221 // and if not, exclude them from the actual binary search.
222 if(0==strcmpAfterPrefix(s
, names
+toc
[0].nameOffset
, startPrefixLength
)) {
227 if(0==strcmpAfterPrefix(s
, names
+toc
[limit
].nameOffset
, limitPrefixLength
)) {
231 int32_t i
=(start
+limit
)/2;
232 int32_t prefixLength
=MIN(startPrefixLength
, limitPrefixLength
);
233 int32_t cmp
=strcmpAfterPrefix(s
, names
+toc
[i
].nameOffset
, prefixLength
);
236 limitPrefixLength
=prefixLength
;
241 startPrefixLength
=prefixLength
;
247 class PrefixBinarySearchPackageLookup
: public BinarySearchPackageLookup
{
249 PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest
&perf
)
250 : BinarySearchPackageLookup(perf
) {}
252 virtual void call(UErrorCode
* /*pErrorCode*/) {
253 int32_t count
=pkg
.getItemCount();
254 const char *itemNameChars
=itemNames
.data();
255 const char *name
=itemNameChars
;
256 for(int32_t i
=0; i
<count
; ++i
) {
257 if(prefixBinarySearch(name
, itemNameChars
, toc
, count
)<0) {
258 fprintf(stderr
, "item not found: %s\n", name
);
260 name
=strchr(name
, 0)+1;
265 static int32_t bytesTrieLookup(const char *s
, const char *nameTrieBytes
) {
266 BytesTrie
trie(nameTrieBytes
);
267 if(USTRINGTRIE_HAS_VALUE(trie
.next(s
, -1))) {
268 return trie
.getValue();
274 class BytesTriePackageLookup
: public PackageLookup
{
276 BytesTriePackageLookup(const DictionaryTriePerfTest
&perf
)
277 : PackageLookup(perf
) {
278 IcuToolErrorCode
errorCode("BinarySearchPackageLookup()");
279 builder
=new BytesTrieBuilder(errorCode
);
280 int32_t count
=pkg
.getItemCount();
281 for(int32_t i
=0; i
<count
; ++i
) {
282 // The Package class removes the "icudt46l/" prefix.
283 // We restore that here for a fair performance test.
284 // We store all full names so that we do not have to reconstruct them
285 // in the call() function.
286 const char *name
=pkg
.getItem(i
)->name
;
287 int32_t offset
=itemNames
.length();
288 itemNames
.append("icudt46l/", errorCode
);
289 itemNames
.append(name
, -1, errorCode
);
290 // As value, set the data item index.
291 // In a real implementation, we would use that to get the
292 // start and limit offset of the data item.
293 StringPiece
fullName(itemNames
.toStringPiece());
294 fullName
.remove_prefix(offset
);
295 builder
->add(fullName
, i
, errorCode
);
296 // NUL-terminate the name for call() to find the next one.
297 itemNames
.append(0, errorCode
);
299 int32_t length
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, errorCode
).length();
300 printf("size of BytesTrie: %6ld\n", (long)length
);
301 // count+1: +1 for the last-item limit offset which we should have always had
302 printf("size of dataOffsets:%6ld\n", (long)((count
+1)*4));
303 printf("total index size: %6ld\n", (long)(length
+(count
+1)*4));
305 virtual ~BytesTriePackageLookup() {
309 virtual void call(UErrorCode
*pErrorCode
) {
310 int32_t count
=pkg
.getItemCount();
311 const char *nameTrieBytes
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, *pErrorCode
).data();
312 const char *name
=itemNames
.data();
313 for(int32_t i
=0; i
<count
; ++i
) {
314 if(bytesTrieLookup(name
, nameTrieBytes
)<0) {
315 fprintf(stderr
, "item not found: %s\n", name
);
317 name
=strchr(name
, 0)+1;
322 BytesTrieBuilder
*builder
;
323 CharString itemNames
;
326 // Performance test function object.
327 // Each subclass loads a dictionary text file
328 // from the -s or --sourcedir path plus -f or --file-name.
329 // For example, <ICU source dir>/source/data/brkitr/thaidict.txt.
330 class DictLookup
: public UPerfFunction
{
332 DictLookup(const DictionaryTriePerfTest
&perfTest
) : perf(perfTest
) {}
334 virtual long getOperationsPerIteration() {
335 return perf
.numTextLines
;
339 const DictionaryTriePerfTest
&perf
;
342 // Closely imitate CompactTrieDictionary::matches().
343 // Note: CompactTrieDictionary::matches() is part of its trie implementation,
344 // and while it loops over the text, it knows the current state.
345 // By contrast, this implementation uses UCharsTrie API functions that have to
346 // check the trie state each time and load/store state in the object.
347 // (Whether it hasNext() and whether it is in the middle of a linear-match node.)
349 ucharsTrieMatches(UCharsTrie
&trie
,
350 UText
*text
, int32_t textLimit
,
351 int32_t *lengths
, int &count
, int limit
) {
352 UChar32 c
=utext_next32(text
);
354 // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
355 // b) It also ignores non-BMP code points by casting to UChar!
359 // Should be firstForCodePoint() but CompactTrieDictionary
360 // handles only code units.
361 UStringTrieResult result
=trie
.first(c
);
365 if(USTRINGTRIE_HAS_VALUE(result
)) {
367 // lengths[count++]=(int32_t)utext_getNativeIndex(text);
368 lengths
[count
++]=numChars
; // CompactTrieDictionary just counts chars too.
370 if(result
==USTRINGTRIE_FINAL_VALUE
) {
373 } else if(result
==USTRINGTRIE_NO_MATCH
) {
376 if(numChars
>=textLimit
) {
377 // Note: Why do we have both a text limit and a UText that knows its length?
380 UChar32 c
=utext_next32(text
);
382 // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
383 // b) It also ignores non-BMP code points by casting to UChar!
388 // Should be nextForCodePoint() but CompactTrieDictionary
389 // handles only code units.
393 // Note: CompactTrieDictionary::matches() comments say that it leaves the UText
394 // after the longest prefix match and returns the number of characters
395 // that were matched.
396 if(index
!=lastMatch
) {
397 utext_setNativeIndex(text
, lastMatch
);
399 return lastMatch
-start
;
400 // However, it does not do either of these, so I am not trying to
401 // imitate it (or its docs) 100%.
406 class UCharsTrieDictLookup
: public DictLookup
{
408 UCharsTrieDictLookup(const DictionaryTriePerfTest
&perfTest
)
409 : DictLookup(perfTest
), trie(NULL
) {
410 IcuToolErrorCode
errorCode("UCharsTrieDictLookup()");
411 builder
=new UCharsTrieBuilder(errorCode
);
412 const ULine
*lines
=perf
.getCachedLines();
413 int32_t numLines
=perf
.getNumLines();
414 for(int32_t i
=0; i
<numLines
; ++i
) {
415 // Skip comment lines (start with a character below 'A').
416 if(lines
[i
].name
[0]<0x41) {
419 builder
->add(UnicodeString(FALSE
, lines
[i
].name
, lines
[i
].len
), 0, errorCode
);
421 UnicodeString trieUChars
;
422 int32_t length
=builder
->buildUnicodeString(USTRINGTRIE_BUILD_SMALL
, trieUChars
, errorCode
).length();
423 printf("size of UCharsTrie: %6ld bytes\n", (long)length
*2);
424 trie
=builder
->build(USTRINGTRIE_BUILD_SMALL
, errorCode
);
427 virtual ~UCharsTrieDictLookup() {
433 UCharsTrieBuilder
*builder
;
437 class UCharsTrieDictMatches
: public UCharsTrieDictLookup
{
439 UCharsTrieDictMatches(const DictionaryTriePerfTest
&perfTest
)
440 : UCharsTrieDictLookup(perfTest
) {}
442 virtual void call(UErrorCode
*pErrorCode
) {
443 UText text
=UTEXT_INITIALIZER
;
445 const ULine
*lines
=perf
.getCachedLines();
446 int32_t numLines
=perf
.getNumLines();
447 for(int32_t i
=0; i
<numLines
; ++i
) {
448 // Skip comment lines (start with a character below 'A').
449 if(lines
[i
].name
[0]<0x41) {
452 utext_openUChars(&text
, lines
[i
].name
, lines
[i
].len
, pErrorCode
);
454 ucharsTrieMatches(*trie
, &text
, lines
[i
].len
,
455 lengths
, count
, UPRV_LENGTHOF(lengths
));
456 if(count
==0 || lengths
[count
-1]!=lines
[i
].len
) {
457 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
463 class UCharsTrieDictContains
: public UCharsTrieDictLookup
{
465 UCharsTrieDictContains(const DictionaryTriePerfTest
&perfTest
)
466 : UCharsTrieDictLookup(perfTest
) {}
468 virtual void call(UErrorCode
* /*pErrorCode*/) {
469 const ULine
*lines
=perf
.getCachedLines();
470 int32_t numLines
=perf
.getNumLines();
471 for(int32_t i
=0; i
<numLines
; ++i
) {
472 // Skip comment lines (which start with a character below 'A').
473 if(lines
[i
].name
[0]<0x41) {
476 if(!USTRINGTRIE_HAS_VALUE(trie
->reset().next(lines
[i
].name
, lines
[i
].len
))) {
477 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
483 static inline int32_t thaiCharToByte(UChar32 c
) {
484 if(0xe00<=c
&& c
<=0xefe) {
493 static UBool
thaiWordToBytes(const UChar
*s
, int32_t length
,
494 CharString
&str
, UErrorCode
&errorCode
) {
495 for(int32_t i
=0; i
<length
; ++i
) {
497 int32_t b
=thaiCharToByte(c
);
499 str
.append((char)b
, errorCode
);
501 fprintf(stderr
, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c
);
508 class BytesTrieDictLookup
: public DictLookup
{
510 BytesTrieDictLookup(const DictionaryTriePerfTest
&perfTest
)
511 : DictLookup(perfTest
), trie(NULL
), noDict(FALSE
) {
512 IcuToolErrorCode
errorCode("BytesTrieDictLookup()");
513 builder
=new BytesTrieBuilder(errorCode
);
515 const ULine
*lines
=perf
.getCachedLines();
516 int32_t numLines
=perf
.getNumLines();
517 for(int32_t i
=0; i
<numLines
; ++i
) {
518 // Skip comment lines (start with a character below 'A').
519 if(lines
[i
].name
[0]<0x41) {
522 if(!thaiWordToBytes(lines
[i
].name
, lines
[i
].len
, str
.clear(), errorCode
)) {
523 fprintf(stderr
, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i
);
527 builder
->add(str
.toStringPiece(), 0, errorCode
);
530 int32_t length
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, errorCode
).length();
531 printf("size of BytesTrie: %6ld bytes\n", (long)length
);
532 trie
=builder
->build(USTRINGTRIE_BUILD_SMALL
, errorCode
);
536 virtual ~BytesTrieDictLookup() {
542 BytesTrieBuilder
*builder
;
548 bytesTrieMatches(BytesTrie
&trie
,
549 UText
*text
, int32_t textLimit
,
550 int32_t *lengths
, int &count
, int limit
) {
551 UChar32 c
=utext_next32(text
);
555 UStringTrieResult result
=trie
.first(thaiCharToByte(c
));
559 if(USTRINGTRIE_HAS_VALUE(result
)) {
561 // lengths[count++]=(int32_t)utext_getNativeIndex(text);
562 lengths
[count
++]=numChars
; // CompactTrieDictionary just counts chars too.
564 if(result
==USTRINGTRIE_FINAL_VALUE
) {
567 } else if(result
==USTRINGTRIE_NO_MATCH
) {
570 if(numChars
>=textLimit
) {
573 UChar32 c
=utext_next32(text
);
578 result
=trie
.next(thaiCharToByte(c
));
583 class BytesTrieDictMatches
: public BytesTrieDictLookup
{
585 BytesTrieDictMatches(const DictionaryTriePerfTest
&perfTest
)
586 : BytesTrieDictLookup(perfTest
) {}
588 virtual void call(UErrorCode
*pErrorCode
) {
592 UText text
=UTEXT_INITIALIZER
;
594 const ULine
*lines
=perf
.getCachedLines();
595 int32_t numLines
=perf
.getNumLines();
596 for(int32_t i
=0; i
<numLines
; ++i
) {
597 // Skip comment lines (start with a character below 'A').
598 if(lines
[i
].name
[0]<0x41) {
601 utext_openUChars(&text
, lines
[i
].name
, lines
[i
].len
, pErrorCode
);
603 bytesTrieMatches(*trie
, &text
, lines
[i
].len
,
604 lengths
, count
, UPRV_LENGTHOF(lengths
));
605 if(count
==0 || lengths
[count
-1]!=lines
[i
].len
) {
606 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
612 class BytesTrieDictContains
: public BytesTrieDictLookup
{
614 BytesTrieDictContains(const DictionaryTriePerfTest
&perfTest
)
615 : BytesTrieDictLookup(perfTest
) {}
617 virtual void call(UErrorCode
* /*pErrorCode*/) {
621 const ULine
*lines
=perf
.getCachedLines();
622 int32_t numLines
=perf
.getNumLines();
623 for(int32_t i
=0; i
<numLines
; ++i
) {
624 const UChar
*line
=lines
[i
].name
;
625 // Skip comment lines (start with a character below 'A').
629 UStringTrieResult result
=trie
->first(thaiCharToByte(line
[0]));
630 int32_t lineLength
=lines
[i
].len
;
631 for(int32_t j
=1; j
<lineLength
; ++j
) {
632 if(!USTRINGTRIE_HAS_NEXT(result
)) {
633 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
636 result
=trie
->next(thaiCharToByte(line
[j
]));
638 if(!USTRINGTRIE_HAS_VALUE(result
)) {
639 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
645 UPerfFunction
*DictionaryTriePerfTest::runIndexedTest(int32_t index
, UBool exec
,
646 const char *&name
, char * /*par*/) {
650 name
="ucharstriematches";
652 return new UCharsTrieDictMatches(*this);
656 name
="ucharstriecontains";
658 return new UCharsTrieDictContains(*this);
662 name
="bytestriematches";
664 return new BytesTrieDictMatches(*this);
668 name
="bytestriecontains";
670 return new BytesTrieDictContains(*this);
678 if(index
==0 && exec
) {
679 puts("Running BytesTrie perf tests on the .dat package file from the --sourcedir.\n"
680 "For UCharsTrie perf tests on a dictionary text file, specify the -f or --file-name.\n");
684 name
="simplebinarysearch";
686 return new BinarySearchPackageLookup(*this);
690 name
="prefixbinarysearch";
692 return new PrefixBinarySearchPackageLookup(*this);
698 return new BytesTriePackageLookup(*this);
709 int main(int argc
, const char *argv
[]) {
710 IcuToolErrorCode
errorCode("dicttrieperf main()");
711 DictionaryTriePerfTest
test(argc
, argv
, errorCode
);
712 if(errorCode
.isFailure()) {
713 fprintf(stderr
, "DictionaryTriePerfTest() failed: %s\n", errorCode
.errorName());
715 return errorCode
.reset();
718 fprintf(stderr
, "FAILED: Tests could not be run, please check the arguments.\n");