2 **********************************************************************
3 * Copyright (C) 2010-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: dicttrieperf.cpp
8 * tab size: 8 (not used)
11 * created on: 2010dec09
12 * created by: Markus W. Scherer
14 * Performance test program for dictionary-type tries.
16 * Usage from within <ICU build tree>/test/perf/dicttrieperf/ :
19 * export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw
20 * ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000
22 * ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250
27 #include "unicode/bytestrie.h"
28 #include "unicode/bytestriebuilder.h"
29 #include "unicode/localpointer.h"
30 #include "unicode/ucharstrie.h"
31 #include "unicode/ucharstriebuilder.h"
32 #include "unicode/uperf.h"
33 #include "unicode/utext.h"
37 #include "ucbuf.h" // struct ULine
40 #include "cmemory.h" // for UPRV_LENGTHOF
43 class DictionaryTriePerfTest
: public UPerfTest
{
45 DictionaryTriePerfTest(int32_t argc
, const char *argv
[], UErrorCode
&status
)
46 : UPerfTest(argc
, argv
, NULL
, 0, "", status
), numTextLines(0) {
49 for(int32_t i
=0; i
<numLines
; ++i
) {
50 // Skip comment lines (start with a character below 'A').
51 if(lines
[i
].name
[0]>=0x41) {
53 // Remove trailing CR LF.
54 int32_t len
=lines
[i
].len
;
56 while(len
>0 && ((c
=lines
[i
].name
[len
-1])==0xa || c
==0xd)) {
65 virtual UPerfFunction
*runIndexedTest(int32_t index
, UBool exec
, const char *&name
, char *par
=NULL
);
67 const char *getSourceDir() const { return sourceDir
; }
69 UBool
hasFile() const { return ucharBuf
!=NULL
; }
70 const ULine
*getCachedLines() const { return lines
; }
71 int32_t getNumLines() const { return numLines
; }
72 int32_t numTextLines
; // excluding comment lines
75 // Performance test function object.
76 // Loads icudt46l.dat (or whatever its current versioned filename)
77 // from the -s or --sourcedir path.
78 class PackageLookup
: public UPerfFunction
{
80 PackageLookup(const DictionaryTriePerfTest
&perf
) {
81 IcuToolErrorCode
errorCode("PackageLookup()");
82 CharString
filename(perf
.getSourceDir(), errorCode
);
83 int32_t filenameLength
=filename
.length();
84 if(filenameLength
>0 && filename
[filenameLength
-1]!=U_FILE_SEP_CHAR
&&
85 filename
[filenameLength
-1]!=U_FILE_ALT_SEP_CHAR
) {
86 filename
.append(U_FILE_SEP_CHAR
, errorCode
);
88 filename
.append(U_ICUDATA_NAME
, errorCode
);
89 filename
.append(".dat", errorCode
);
90 pkg
.readPackage(filename
.data());
94 virtual ~PackageLookup() {}
96 // virtual void call(UErrorCode* pErrorCode) { ... }
98 virtual long getOperationsPerIteration() {
99 return pkg
.getItemCount();
102 // virtual long getEventsPerIteration();
109 int32_t nameOffset
, dataOffset
;
112 // Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c).
113 static int32_t simpleBinarySearch(const char *s
, const char *names
, const TOCEntry
*toc
, int32_t count
) {
116 int32_t lastNumber
=limit
;
118 int32_t number
=(start
+limit
)/2;
119 if(lastNumber
==number
) { // have we moved?
120 return -1; // not found
123 int32_t cmp
=strcmp(s
, names
+toc
[number
].nameOffset
);
134 class BinarySearchPackageLookup
: public PackageLookup
{
136 BinarySearchPackageLookup(const DictionaryTriePerfTest
&perf
)
137 : PackageLookup(perf
) {
138 IcuToolErrorCode
errorCode("BinarySearchPackageLookup()");
139 int32_t count
=pkg
.getItemCount();
140 toc
=new TOCEntry
[count
];
141 for(int32_t i
=0; i
<count
; ++i
) {
142 toc
[i
].nameOffset
=itemNames
.length();
143 toc
[i
].dataOffset
=i
; // arbitrary value, see toc comment below
144 // The Package class removes the "icudt46l/" prefix.
145 // We restore that here for a fair performance test.
146 const char *name
=pkg
.getItem(i
)->name
;
147 itemNames
.append("icudt46l/", errorCode
);
148 itemNames
.append(name
, strlen(name
)+1, errorCode
);
150 printf("size of item names: %6ld\n", (long)itemNames
.length());
151 printf("size of TOC: %6ld\n", (long)(count
*8));
152 printf("total index size: %6ld\n", (long)(itemNames
.length()+count
*8));
154 virtual ~BinarySearchPackageLookup() {
158 virtual void call(UErrorCode
* /*pErrorCode*/) {
159 int32_t count
=pkg
.getItemCount();
160 const char *itemNameChars
=itemNames
.data();
161 const char *name
=itemNameChars
;
162 for(int32_t i
=0; i
<count
; ++i
) {
163 if(simpleBinarySearch(name
, itemNameChars
, toc
, count
)<0) {
164 fprintf(stderr
, "item not found: %s\n", name
);
166 name
=strchr(name
, 0)+1;
171 CharString itemNames
;
172 // toc imitates a .dat file's array of UDataOffsetTOCEntry
173 // with nameOffset and dataOffset.
174 // We don't need the dataOffsets, but we want to imitate the real
175 // memory density, to measure equivalent CPU cache usage.
180 #define MIN(a,b) (((a)<(b)) ? (a) : (b))
183 // Compare strings where we know the shared prefix length,
184 // and advance the prefix length as we find that the strings share even more characters.
185 static int32_t strcmpAfterPrefix(const char *s1
, const char *s2
, int32_t &prefixLength
) {
186 int32_t pl
=prefixLength
;
191 int32_t c1
=(uint8_t)*s1
++;
192 int32_t c2
=(uint8_t)*s2
++;
194 if(cmp
!=0 || c1
==0) { // different or done
197 ++pl
; // increment shared same-prefix length
203 static int32_t prefixBinarySearch(const char *s
, const char *names
, const TOCEntry
*toc
, int32_t count
) {
209 // Remember the shared prefix between s, start and limit,
210 // and don't compare that shared prefix again.
211 // The shared prefix should get longer as we narrow the [start, limit[ range.
212 int32_t startPrefixLength
=0;
213 int32_t limitPrefixLength
=0;
214 // Prime the prefix lengths so that we don't keep prefixLength at 0 until
215 // both the start and limit indexes have moved.
216 // At the same time, we find if s is one of the start and (limit-1) names,
217 // and if not, exclude them from the actual binary search.
218 if(0==strcmpAfterPrefix(s
, names
+toc
[0].nameOffset
, startPrefixLength
)) {
223 if(0==strcmpAfterPrefix(s
, names
+toc
[limit
].nameOffset
, limitPrefixLength
)) {
227 int32_t i
=(start
+limit
)/2;
228 int32_t prefixLength
=MIN(startPrefixLength
, limitPrefixLength
);
229 int32_t cmp
=strcmpAfterPrefix(s
, names
+toc
[i
].nameOffset
, prefixLength
);
232 limitPrefixLength
=prefixLength
;
237 startPrefixLength
=prefixLength
;
243 class PrefixBinarySearchPackageLookup
: public BinarySearchPackageLookup
{
245 PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest
&perf
)
246 : BinarySearchPackageLookup(perf
) {}
248 virtual void call(UErrorCode
* /*pErrorCode*/) {
249 int32_t count
=pkg
.getItemCount();
250 const char *itemNameChars
=itemNames
.data();
251 const char *name
=itemNameChars
;
252 for(int32_t i
=0; i
<count
; ++i
) {
253 if(prefixBinarySearch(name
, itemNameChars
, toc
, count
)<0) {
254 fprintf(stderr
, "item not found: %s\n", name
);
256 name
=strchr(name
, 0)+1;
261 static int32_t bytesTrieLookup(const char *s
, const char *nameTrieBytes
) {
262 BytesTrie
trie(nameTrieBytes
);
263 if(USTRINGTRIE_HAS_VALUE(trie
.next(s
, -1))) {
264 return trie
.getValue();
270 class BytesTriePackageLookup
: public PackageLookup
{
272 BytesTriePackageLookup(const DictionaryTriePerfTest
&perf
)
273 : PackageLookup(perf
) {
274 IcuToolErrorCode
errorCode("BinarySearchPackageLookup()");
275 builder
=new BytesTrieBuilder(errorCode
);
276 int32_t count
=pkg
.getItemCount();
277 for(int32_t i
=0; i
<count
; ++i
) {
278 // The Package class removes the "icudt46l/" prefix.
279 // We restore that here for a fair performance test.
280 // We store all full names so that we do not have to reconstruct them
281 // in the call() function.
282 const char *name
=pkg
.getItem(i
)->name
;
283 int32_t offset
=itemNames
.length();
284 itemNames
.append("icudt46l/", errorCode
);
285 itemNames
.append(name
, -1, errorCode
);
286 // As value, set the data item index.
287 // In a real implementation, we would use that to get the
288 // start and limit offset of the data item.
289 StringPiece
fullName(itemNames
.toStringPiece());
290 fullName
.remove_prefix(offset
);
291 builder
->add(fullName
, i
, errorCode
);
292 // NUL-terminate the name for call() to find the next one.
293 itemNames
.append(0, errorCode
);
295 int32_t length
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, errorCode
).length();
296 printf("size of BytesTrie: %6ld\n", (long)length
);
297 // count+1: +1 for the last-item limit offset which we should have always had
298 printf("size of dataOffsets:%6ld\n", (long)((count
+1)*4));
299 printf("total index size: %6ld\n", (long)(length
+(count
+1)*4));
301 virtual ~BytesTriePackageLookup() {
305 virtual void call(UErrorCode
*pErrorCode
) {
306 int32_t count
=pkg
.getItemCount();
307 const char *nameTrieBytes
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, *pErrorCode
).data();
308 const char *name
=itemNames
.data();
309 for(int32_t i
=0; i
<count
; ++i
) {
310 if(bytesTrieLookup(name
, nameTrieBytes
)<0) {
311 fprintf(stderr
, "item not found: %s\n", name
);
313 name
=strchr(name
, 0)+1;
318 BytesTrieBuilder
*builder
;
319 CharString itemNames
;
322 // Performance test function object.
323 // Each subclass loads a dictionary text file
324 // from the -s or --sourcedir path plus -f or --file-name.
325 // For example, <ICU source dir>/source/data/brkitr/thaidict.txt.
326 class DictLookup
: public UPerfFunction
{
328 DictLookup(const DictionaryTriePerfTest
&perfTest
) : perf(perfTest
) {}
330 virtual long getOperationsPerIteration() {
331 return perf
.numTextLines
;
335 const DictionaryTriePerfTest
&perf
;
338 // Closely imitate CompactTrieDictionary::matches().
339 // Note: CompactTrieDictionary::matches() is part of its trie implementation,
340 // and while it loops over the text, it knows the current state.
341 // By contrast, this implementation uses UCharsTrie API functions that have to
342 // check the trie state each time and load/store state in the object.
343 // (Whether it hasNext() and whether it is in the middle of a linear-match node.)
345 ucharsTrieMatches(UCharsTrie
&trie
,
346 UText
*text
, int32_t textLimit
,
347 int32_t *lengths
, int &count
, int limit
) {
348 UChar32 c
=utext_next32(text
);
350 // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
351 // b) It also ignores non-BMP code points by casting to UChar!
355 // Should be firstForCodePoint() but CompactTrieDictionary
356 // handles only code units.
357 UStringTrieResult result
=trie
.first(c
);
361 if(USTRINGTRIE_HAS_VALUE(result
)) {
363 // lengths[count++]=(int32_t)utext_getNativeIndex(text);
364 lengths
[count
++]=numChars
; // CompactTrieDictionary just counts chars too.
366 if(result
==USTRINGTRIE_FINAL_VALUE
) {
369 } else if(result
==USTRINGTRIE_NO_MATCH
) {
372 if(numChars
>=textLimit
) {
373 // Note: Why do we have both a text limit and a UText that knows its length?
376 UChar32 c
=utext_next32(text
);
378 // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
379 // b) It also ignores non-BMP code points by casting to UChar!
384 // Should be nextForCodePoint() but CompactTrieDictionary
385 // handles only code units.
389 // Note: CompactTrieDictionary::matches() comments say that it leaves the UText
390 // after the longest prefix match and returns the number of characters
391 // that were matched.
392 if(index
!=lastMatch
) {
393 utext_setNativeIndex(text
, lastMatch
);
395 return lastMatch
-start
;
396 // However, it does not do either of these, so I am not trying to
397 // imitate it (or its docs) 100%.
402 class UCharsTrieDictLookup
: public DictLookup
{
404 UCharsTrieDictLookup(const DictionaryTriePerfTest
&perfTest
)
405 : DictLookup(perfTest
), trie(NULL
) {
406 IcuToolErrorCode
errorCode("UCharsTrieDictLookup()");
407 builder
=new UCharsTrieBuilder(errorCode
);
408 const ULine
*lines
=perf
.getCachedLines();
409 int32_t numLines
=perf
.getNumLines();
410 for(int32_t i
=0; i
<numLines
; ++i
) {
411 // Skip comment lines (start with a character below 'A').
412 if(lines
[i
].name
[0]<0x41) {
415 builder
->add(UnicodeString(FALSE
, lines
[i
].name
, lines
[i
].len
), 0, errorCode
);
417 UnicodeString trieUChars
;
418 int32_t length
=builder
->buildUnicodeString(USTRINGTRIE_BUILD_SMALL
, trieUChars
, errorCode
).length();
419 printf("size of UCharsTrie: %6ld bytes\n", (long)length
*2);
420 trie
=builder
->build(USTRINGTRIE_BUILD_SMALL
, errorCode
);
423 virtual ~UCharsTrieDictLookup() {
429 UCharsTrieBuilder
*builder
;
433 class UCharsTrieDictMatches
: public UCharsTrieDictLookup
{
435 UCharsTrieDictMatches(const DictionaryTriePerfTest
&perfTest
)
436 : UCharsTrieDictLookup(perfTest
) {}
438 virtual void call(UErrorCode
*pErrorCode
) {
439 UText text
=UTEXT_INITIALIZER
;
441 const ULine
*lines
=perf
.getCachedLines();
442 int32_t numLines
=perf
.getNumLines();
443 for(int32_t i
=0; i
<numLines
; ++i
) {
444 // Skip comment lines (start with a character below 'A').
445 if(lines
[i
].name
[0]<0x41) {
448 utext_openUChars(&text
, lines
[i
].name
, lines
[i
].len
, pErrorCode
);
450 ucharsTrieMatches(*trie
, &text
, lines
[i
].len
,
451 lengths
, count
, UPRV_LENGTHOF(lengths
));
452 if(count
==0 || lengths
[count
-1]!=lines
[i
].len
) {
453 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
459 class UCharsTrieDictContains
: public UCharsTrieDictLookup
{
461 UCharsTrieDictContains(const DictionaryTriePerfTest
&perfTest
)
462 : UCharsTrieDictLookup(perfTest
) {}
464 virtual void call(UErrorCode
* /*pErrorCode*/) {
465 const ULine
*lines
=perf
.getCachedLines();
466 int32_t numLines
=perf
.getNumLines();
467 for(int32_t i
=0; i
<numLines
; ++i
) {
468 // Skip comment lines (which start with a character below 'A').
469 if(lines
[i
].name
[0]<0x41) {
472 if(!USTRINGTRIE_HAS_VALUE(trie
->reset().next(lines
[i
].name
, lines
[i
].len
))) {
473 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
479 static inline int32_t thaiCharToByte(UChar32 c
) {
480 if(0xe00<=c
&& c
<=0xefe) {
489 static UBool
thaiWordToBytes(const UChar
*s
, int32_t length
,
490 CharString
&str
, UErrorCode
&errorCode
) {
491 for(int32_t i
=0; i
<length
; ++i
) {
493 int32_t b
=thaiCharToByte(c
);
495 str
.append((char)b
, errorCode
);
497 fprintf(stderr
, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c
);
504 class BytesTrieDictLookup
: public DictLookup
{
506 BytesTrieDictLookup(const DictionaryTriePerfTest
&perfTest
)
507 : DictLookup(perfTest
), trie(NULL
), noDict(FALSE
) {
508 IcuToolErrorCode
errorCode("BytesTrieDictLookup()");
509 builder
=new BytesTrieBuilder(errorCode
);
511 const ULine
*lines
=perf
.getCachedLines();
512 int32_t numLines
=perf
.getNumLines();
513 for(int32_t i
=0; i
<numLines
; ++i
) {
514 // Skip comment lines (start with a character below 'A').
515 if(lines
[i
].name
[0]<0x41) {
518 if(!thaiWordToBytes(lines
[i
].name
, lines
[i
].len
, str
.clear(), errorCode
)) {
519 fprintf(stderr
, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i
);
523 builder
->add(str
.toStringPiece(), 0, errorCode
);
526 int32_t length
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, errorCode
).length();
527 printf("size of BytesTrie: %6ld bytes\n", (long)length
);
528 trie
=builder
->build(USTRINGTRIE_BUILD_SMALL
, errorCode
);
532 virtual ~BytesTrieDictLookup() {
538 BytesTrieBuilder
*builder
;
544 bytesTrieMatches(BytesTrie
&trie
,
545 UText
*text
, int32_t textLimit
,
546 int32_t *lengths
, int &count
, int limit
) {
547 UChar32 c
=utext_next32(text
);
551 UStringTrieResult result
=trie
.first(thaiCharToByte(c
));
555 if(USTRINGTRIE_HAS_VALUE(result
)) {
557 // lengths[count++]=(int32_t)utext_getNativeIndex(text);
558 lengths
[count
++]=numChars
; // CompactTrieDictionary just counts chars too.
560 if(result
==USTRINGTRIE_FINAL_VALUE
) {
563 } else if(result
==USTRINGTRIE_NO_MATCH
) {
566 if(numChars
>=textLimit
) {
569 UChar32 c
=utext_next32(text
);
574 result
=trie
.next(thaiCharToByte(c
));
579 class BytesTrieDictMatches
: public BytesTrieDictLookup
{
581 BytesTrieDictMatches(const DictionaryTriePerfTest
&perfTest
)
582 : BytesTrieDictLookup(perfTest
) {}
584 virtual void call(UErrorCode
*pErrorCode
) {
588 UText text
=UTEXT_INITIALIZER
;
590 const ULine
*lines
=perf
.getCachedLines();
591 int32_t numLines
=perf
.getNumLines();
592 for(int32_t i
=0; i
<numLines
; ++i
) {
593 // Skip comment lines (start with a character below 'A').
594 if(lines
[i
].name
[0]<0x41) {
597 utext_openUChars(&text
, lines
[i
].name
, lines
[i
].len
, pErrorCode
);
599 bytesTrieMatches(*trie
, &text
, lines
[i
].len
,
600 lengths
, count
, UPRV_LENGTHOF(lengths
));
601 if(count
==0 || lengths
[count
-1]!=lines
[i
].len
) {
602 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
608 class BytesTrieDictContains
: public BytesTrieDictLookup
{
610 BytesTrieDictContains(const DictionaryTriePerfTest
&perfTest
)
611 : BytesTrieDictLookup(perfTest
) {}
613 virtual void call(UErrorCode
* /*pErrorCode*/) {
617 const ULine
*lines
=perf
.getCachedLines();
618 int32_t numLines
=perf
.getNumLines();
619 for(int32_t i
=0; i
<numLines
; ++i
) {
620 const UChar
*line
=lines
[i
].name
;
621 // Skip comment lines (start with a character below 'A').
625 UStringTrieResult result
=trie
->first(thaiCharToByte(line
[0]));
626 int32_t lineLength
=lines
[i
].len
;
627 for(int32_t j
=1; j
<lineLength
; ++j
) {
628 if(!USTRINGTRIE_HAS_NEXT(result
)) {
629 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
632 result
=trie
->next(thaiCharToByte(line
[j
]));
634 if(!USTRINGTRIE_HAS_VALUE(result
)) {
635 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
641 UPerfFunction
*DictionaryTriePerfTest::runIndexedTest(int32_t index
, UBool exec
,
642 const char *&name
, char * /*par*/) {
646 name
="ucharstriematches";
648 return new UCharsTrieDictMatches(*this);
652 name
="ucharstriecontains";
654 return new UCharsTrieDictContains(*this);
658 name
="bytestriematches";
660 return new BytesTrieDictMatches(*this);
664 name
="bytestriecontains";
666 return new BytesTrieDictContains(*this);
674 if(index
==0 && exec
) {
675 puts("Running BytesTrie perf tests on the .dat package file from the --sourcedir.\n"
676 "For UCharsTrie perf tests on a dictionary text file, specify the -f or --file-name.\n");
680 name
="simplebinarysearch";
682 return new BinarySearchPackageLookup(*this);
686 name
="prefixbinarysearch";
688 return new PrefixBinarySearchPackageLookup(*this);
694 return new BytesTriePackageLookup(*this);
705 int main(int argc
, const char *argv
[]) {
706 IcuToolErrorCode
errorCode("dicttrieperf main()");
707 DictionaryTriePerfTest
test(argc
, argv
, errorCode
);
708 if(errorCode
.isFailure()) {
709 fprintf(stderr
, "DictionaryTriePerfTest() failed: %s\n", errorCode
.errorName());
711 return errorCode
.reset();
714 fprintf(stderr
, "FAILED: Tests could not be run, please check the arguments.\n");