2 **********************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: dicttrieperf.cpp
8 * tab size: 8 (not used)
11 * created on: 2010dec09
12 * created by: Markus W. Scherer
14 * Performance test program for dictionary-type tries.
16 * Usage from within <ICU build tree>/test/perf/dicttrieperf/ :
19 * export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw
20 * ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000
22 * ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250
27 #include "unicode/bytestrie.h"
28 #include "unicode/bytestriebuilder.h"
29 #include "unicode/localpointer.h"
30 #include "unicode/ucharstrie.h"
31 #include "unicode/ucharstriebuilder.h"
32 #include "unicode/uperf.h"
33 #include "unicode/utext.h"
37 #include "ucbuf.h" // struct ULine
41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
44 class DictionaryTriePerfTest
: public UPerfTest
{
46 DictionaryTriePerfTest(int32_t argc
, const char *argv
[], UErrorCode
&status
)
47 : UPerfTest(argc
, argv
, NULL
, 0, "", status
), numTextLines(0) {
50 for(int32_t i
=0; i
<numLines
; ++i
) {
51 // Skip comment lines (start with a character below 'A').
52 if(lines
[i
].name
[0]>=0x41) {
54 // Remove trailing CR LF.
55 int32_t len
=lines
[i
].len
;
57 while(len
>0 && ((c
=lines
[i
].name
[len
-1])==0xa || c
==0xd)) {
66 virtual UPerfFunction
*runIndexedTest(int32_t index
, UBool exec
, const char *&name
, char *par
=NULL
);
68 const char *getSourceDir() const { return sourceDir
; }
70 UBool
hasFile() const { return ucharBuf
!=NULL
; }
71 const ULine
*getCachedLines() const { return lines
; }
72 int32_t getNumLines() const { return numLines
; }
73 int32_t numTextLines
; // excluding comment lines
76 // Performance test function object.
77 // Loads icudt46l.dat (or whatever its current versioned filename)
78 // from the -s or --sourcedir path.
79 class PackageLookup
: public UPerfFunction
{
81 PackageLookup(const DictionaryTriePerfTest
&perf
) {
82 IcuToolErrorCode
errorCode("PackageLookup()");
83 CharString
filename(perf
.getSourceDir(), errorCode
);
84 int32_t filenameLength
=filename
.length();
85 if(filenameLength
>0 && filename
[filenameLength
-1]!=U_FILE_SEP_CHAR
&&
86 filename
[filenameLength
-1]!=U_FILE_ALT_SEP_CHAR
) {
87 filename
.append(U_FILE_SEP_CHAR
, errorCode
);
89 filename
.append(U_ICUDATA_NAME
, errorCode
);
90 filename
.append(".dat", errorCode
);
91 pkg
.readPackage(filename
.data());
95 virtual ~PackageLookup() {}
97 // virtual void call(UErrorCode* pErrorCode) { ... }
99 virtual long getOperationsPerIteration() {
100 return pkg
.getItemCount();
103 // virtual long getEventsPerIteration();
110 int32_t nameOffset
, dataOffset
;
113 // Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c).
114 static int32_t simpleBinarySearch(const char *s
, const char *names
, const TOCEntry
*toc
, int32_t count
) {
117 int32_t lastNumber
=limit
;
119 int32_t number
=(start
+limit
)/2;
120 if(lastNumber
==number
) { // have we moved?
121 return -1; // not found
124 int32_t cmp
=strcmp(s
, names
+toc
[number
].nameOffset
);
135 class BinarySearchPackageLookup
: public PackageLookup
{
137 BinarySearchPackageLookup(const DictionaryTriePerfTest
&perf
)
138 : PackageLookup(perf
) {
139 IcuToolErrorCode
errorCode("BinarySearchPackageLookup()");
140 int32_t count
=pkg
.getItemCount();
141 toc
=new TOCEntry
[count
];
142 for(int32_t i
=0; i
<count
; ++i
) {
143 toc
[i
].nameOffset
=itemNames
.length();
144 toc
[i
].dataOffset
=i
; // arbitrary value, see toc comment below
145 // The Package class removes the "icudt46l/" prefix.
146 // We restore that here for a fair performance test.
147 const char *name
=pkg
.getItem(i
)->name
;
148 itemNames
.append("icudt46l/", errorCode
);
149 itemNames
.append(name
, strlen(name
)+1, errorCode
);
151 printf("size of item names: %6ld\n", (long)itemNames
.length());
152 printf("size of TOC: %6ld\n", (long)(count
*8));
153 printf("total index size: %6ld\n", (long)(itemNames
.length()+count
*8));
155 virtual ~BinarySearchPackageLookup() {
159 virtual void call(UErrorCode
* /*pErrorCode*/) {
160 int32_t count
=pkg
.getItemCount();
161 const char *itemNameChars
=itemNames
.data();
162 const char *name
=itemNameChars
;
163 for(int32_t i
=0; i
<count
; ++i
) {
164 if(simpleBinarySearch(name
, itemNameChars
, toc
, count
)<0) {
165 fprintf(stderr
, "item not found: %s\n", name
);
167 name
=strchr(name
, 0)+1;
172 CharString itemNames
;
173 // toc imitates a .dat file's array of UDataOffsetTOCEntry
174 // with nameOffset and dataOffset.
175 // We don't need the dataOffsets, but we want to imitate the real
176 // memory density, to measure equivalent CPU cache usage.
181 #define MIN(a,b) (((a)<(b)) ? (a) : (b))
184 // Compare strings where we know the shared prefix length,
185 // and advance the prefix length as we find that the strings share even more characters.
186 static int32_t strcmpAfterPrefix(const char *s1
, const char *s2
, int32_t &prefixLength
) {
187 int32_t pl
=prefixLength
;
192 int32_t c1
=(uint8_t)*s1
++;
193 int32_t c2
=(uint8_t)*s2
++;
195 if(cmp
!=0 || c1
==0) { // different or done
198 ++pl
; // increment shared same-prefix length
204 static int32_t prefixBinarySearch(const char *s
, const char *names
, const TOCEntry
*toc
, int32_t count
) {
210 // Remember the shared prefix between s, start and limit,
211 // and don't compare that shared prefix again.
212 // The shared prefix should get longer as we narrow the [start, limit[ range.
213 int32_t startPrefixLength
=0;
214 int32_t limitPrefixLength
=0;
215 // Prime the prefix lengths so that we don't keep prefixLength at 0 until
216 // both the start and limit indexes have moved.
217 // At the same time, we find if s is one of the start and (limit-1) names,
218 // and if not, exclude them from the actual binary search.
219 if(0==strcmpAfterPrefix(s
, names
+toc
[0].nameOffset
, startPrefixLength
)) {
224 if(0==strcmpAfterPrefix(s
, names
+toc
[limit
].nameOffset
, limitPrefixLength
)) {
228 int32_t i
=(start
+limit
)/2;
229 int32_t prefixLength
=MIN(startPrefixLength
, limitPrefixLength
);
230 int32_t cmp
=strcmpAfterPrefix(s
, names
+toc
[i
].nameOffset
, prefixLength
);
233 limitPrefixLength
=prefixLength
;
238 startPrefixLength
=prefixLength
;
244 class PrefixBinarySearchPackageLookup
: public BinarySearchPackageLookup
{
246 PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest
&perf
)
247 : BinarySearchPackageLookup(perf
) {}
249 virtual void call(UErrorCode
* /*pErrorCode*/) {
250 int32_t count
=pkg
.getItemCount();
251 const char *itemNameChars
=itemNames
.data();
252 const char *name
=itemNameChars
;
253 for(int32_t i
=0; i
<count
; ++i
) {
254 if(prefixBinarySearch(name
, itemNameChars
, toc
, count
)<0) {
255 fprintf(stderr
, "item not found: %s\n", name
);
257 name
=strchr(name
, 0)+1;
262 static int32_t bytesTrieLookup(const char *s
, const char *nameTrieBytes
) {
263 BytesTrie
trie(nameTrieBytes
);
264 if(USTRINGTRIE_HAS_VALUE(trie
.next(s
, -1))) {
265 return trie
.getValue();
271 class BytesTriePackageLookup
: public PackageLookup
{
273 BytesTriePackageLookup(const DictionaryTriePerfTest
&perf
)
274 : PackageLookup(perf
) {
275 IcuToolErrorCode
errorCode("BinarySearchPackageLookup()");
276 builder
=new BytesTrieBuilder(errorCode
);
277 int32_t count
=pkg
.getItemCount();
278 for(int32_t i
=0; i
<count
; ++i
) {
279 // The Package class removes the "icudt46l/" prefix.
280 // We restore that here for a fair performance test.
281 // We store all full names so that we do not have to reconstruct them
282 // in the call() function.
283 const char *name
=pkg
.getItem(i
)->name
;
284 int32_t offset
=itemNames
.length();
285 itemNames
.append("icudt46l/", errorCode
);
286 itemNames
.append(name
, -1, errorCode
);
287 // As value, set the data item index.
288 // In a real implementation, we would use that to get the
289 // start and limit offset of the data item.
290 StringPiece
fullName(itemNames
.toStringPiece());
291 fullName
.remove_prefix(offset
);
292 builder
->add(fullName
, i
, errorCode
);
293 // NUL-terminate the name for call() to find the next one.
294 itemNames
.append(0, errorCode
);
296 int32_t length
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, errorCode
).length();
297 printf("size of BytesTrie: %6ld\n", (long)length
);
298 // count+1: +1 for the last-item limit offset which we should have always had
299 printf("size of dataOffsets:%6ld\n", (long)((count
+1)*4));
300 printf("total index size: %6ld\n", (long)(length
+(count
+1)*4));
302 virtual ~BytesTriePackageLookup() {
306 virtual void call(UErrorCode
*pErrorCode
) {
307 int32_t count
=pkg
.getItemCount();
308 const char *nameTrieBytes
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, *pErrorCode
).data();
309 const char *name
=itemNames
.data();
310 for(int32_t i
=0; i
<count
; ++i
) {
311 if(bytesTrieLookup(name
, nameTrieBytes
)<0) {
312 fprintf(stderr
, "item not found: %s\n", name
);
314 name
=strchr(name
, 0)+1;
319 BytesTrieBuilder
*builder
;
320 CharString itemNames
;
323 // Performance test function object.
324 // Each subclass loads a dictionary text file
325 // from the -s or --sourcedir path plus -f or --file-name.
326 // For example, <ICU source dir>/source/data/brkitr/thaidict.txt.
327 class DictLookup
: public UPerfFunction
{
329 DictLookup(const DictionaryTriePerfTest
&perfTest
) : perf(perfTest
) {}
331 virtual long getOperationsPerIteration() {
332 return perf
.numTextLines
;
336 const DictionaryTriePerfTest
&perf
;
339 // Closely imitate CompactTrieDictionary::matches().
340 // Note: CompactTrieDictionary::matches() is part of its trie implementation,
341 // and while it loops over the text, it knows the current state.
342 // By contrast, this implementation uses UCharsTrie API functions that have to
343 // check the trie state each time and load/store state in the object.
344 // (Whether it hasNext() and whether it is in the middle of a linear-match node.)
346 ucharsTrieMatches(UCharsTrie
&trie
,
347 UText
*text
, int32_t textLimit
,
348 int32_t *lengths
, int &count
, int limit
) {
349 UChar32 c
=utext_next32(text
);
351 // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
352 // b) It also ignores non-BMP code points by casting to UChar!
356 // Should be firstForCodePoint() but CompactTrieDictionary
357 // handles only code units.
358 UStringTrieResult result
=trie
.first(c
);
362 if(USTRINGTRIE_HAS_VALUE(result
)) {
364 // lengths[count++]=(int32_t)utext_getNativeIndex(text);
365 lengths
[count
++]=numChars
; // CompactTrieDictionary just counts chars too.
367 if(result
==USTRINGTRIE_FINAL_VALUE
) {
370 } else if(result
==USTRINGTRIE_NO_MATCH
) {
373 if(numChars
>=textLimit
) {
374 // Note: Why do we have both a text limit and a UText that knows its length?
377 UChar32 c
=utext_next32(text
);
379 // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
380 // b) It also ignores non-BMP code points by casting to UChar!
385 // Should be nextForCodePoint() but CompactTrieDictionary
386 // handles only code units.
390 // Note: CompactTrieDictionary::matches() comments say that it leaves the UText
391 // after the longest prefix match and returns the number of characters
392 // that were matched.
393 if(index
!=lastMatch
) {
394 utext_setNativeIndex(text
, lastMatch
);
396 return lastMatch
-start
;
397 // However, it does not do either of these, so I am not trying to
398 // imitate it (or its docs) 100%.
403 class UCharsTrieDictLookup
: public DictLookup
{
405 UCharsTrieDictLookup(const DictionaryTriePerfTest
&perfTest
)
406 : DictLookup(perfTest
), trie(NULL
) {
407 IcuToolErrorCode
errorCode("UCharsTrieDictLookup()");
408 builder
=new UCharsTrieBuilder(errorCode
);
409 const ULine
*lines
=perf
.getCachedLines();
410 int32_t numLines
=perf
.getNumLines();
411 for(int32_t i
=0; i
<numLines
; ++i
) {
412 // Skip comment lines (start with a character below 'A').
413 if(lines
[i
].name
[0]<0x41) {
416 builder
->add(UnicodeString(FALSE
, lines
[i
].name
, lines
[i
].len
), 0, errorCode
);
418 UnicodeString trieUChars
;
419 int32_t length
=builder
->buildUnicodeString(USTRINGTRIE_BUILD_SMALL
, trieUChars
, errorCode
).length();
420 printf("size of UCharsTrie: %6ld bytes\n", (long)length
*2);
421 trie
=builder
->build(USTRINGTRIE_BUILD_SMALL
, errorCode
);
424 virtual ~UCharsTrieDictLookup() {
430 UCharsTrieBuilder
*builder
;
434 class UCharsTrieDictMatches
: public UCharsTrieDictLookup
{
436 UCharsTrieDictMatches(const DictionaryTriePerfTest
&perfTest
)
437 : UCharsTrieDictLookup(perfTest
) {}
439 virtual void call(UErrorCode
*pErrorCode
) {
440 UText text
=UTEXT_INITIALIZER
;
442 const ULine
*lines
=perf
.getCachedLines();
443 int32_t numLines
=perf
.getNumLines();
444 for(int32_t i
=0; i
<numLines
; ++i
) {
445 // Skip comment lines (start with a character below 'A').
446 if(lines
[i
].name
[0]<0x41) {
449 utext_openUChars(&text
, lines
[i
].name
, lines
[i
].len
, pErrorCode
);
451 ucharsTrieMatches(*trie
, &text
, lines
[i
].len
,
452 lengths
, count
, LENGTHOF(lengths
));
453 if(count
==0 || lengths
[count
-1]!=lines
[i
].len
) {
454 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
460 class UCharsTrieDictContains
: public UCharsTrieDictLookup
{
462 UCharsTrieDictContains(const DictionaryTriePerfTest
&perfTest
)
463 : UCharsTrieDictLookup(perfTest
) {}
465 virtual void call(UErrorCode
* /*pErrorCode*/) {
466 const ULine
*lines
=perf
.getCachedLines();
467 int32_t numLines
=perf
.getNumLines();
468 for(int32_t i
=0; i
<numLines
; ++i
) {
469 // Skip comment lines (which start with a character below 'A').
470 if(lines
[i
].name
[0]<0x41) {
473 if(!USTRINGTRIE_HAS_VALUE(trie
->reset().next(lines
[i
].name
, lines
[i
].len
))) {
474 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
480 static inline int32_t thaiCharToByte(UChar32 c
) {
481 if(0xe00<=c
&& c
<=0xefe) {
490 static UBool
thaiWordToBytes(const UChar
*s
, int32_t length
,
491 CharString
&str
, UErrorCode
&errorCode
) {
492 for(int32_t i
=0; i
<length
; ++i
) {
494 int32_t b
=thaiCharToByte(c
);
496 str
.append((char)b
, errorCode
);
498 fprintf(stderr
, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c
);
505 class BytesTrieDictLookup
: public DictLookup
{
507 BytesTrieDictLookup(const DictionaryTriePerfTest
&perfTest
)
508 : DictLookup(perfTest
), trie(NULL
), noDict(FALSE
) {
509 IcuToolErrorCode
errorCode("BytesTrieDictLookup()");
510 builder
=new BytesTrieBuilder(errorCode
);
512 const ULine
*lines
=perf
.getCachedLines();
513 int32_t numLines
=perf
.getNumLines();
514 for(int32_t i
=0; i
<numLines
; ++i
) {
515 // Skip comment lines (start with a character below 'A').
516 if(lines
[i
].name
[0]<0x41) {
519 if(!thaiWordToBytes(lines
[i
].name
, lines
[i
].len
, str
.clear(), errorCode
)) {
520 fprintf(stderr
, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i
);
524 builder
->add(str
.toStringPiece(), 0, errorCode
);
527 int32_t length
=builder
->buildStringPiece(USTRINGTRIE_BUILD_SMALL
, errorCode
).length();
528 printf("size of BytesTrie: %6ld bytes\n", (long)length
);
529 trie
=builder
->build(USTRINGTRIE_BUILD_SMALL
, errorCode
);
533 virtual ~BytesTrieDictLookup() {
539 BytesTrieBuilder
*builder
;
545 bytesTrieMatches(BytesTrie
&trie
,
546 UText
*text
, int32_t textLimit
,
547 int32_t *lengths
, int &count
, int limit
) {
548 UChar32 c
=utext_next32(text
);
552 UStringTrieResult result
=trie
.first(thaiCharToByte(c
));
556 if(USTRINGTRIE_HAS_VALUE(result
)) {
558 // lengths[count++]=(int32_t)utext_getNativeIndex(text);
559 lengths
[count
++]=numChars
; // CompactTrieDictionary just counts chars too.
561 if(result
==USTRINGTRIE_FINAL_VALUE
) {
564 } else if(result
==USTRINGTRIE_NO_MATCH
) {
567 if(numChars
>=textLimit
) {
570 UChar32 c
=utext_next32(text
);
575 result
=trie
.next(thaiCharToByte(c
));
580 class BytesTrieDictMatches
: public BytesTrieDictLookup
{
582 BytesTrieDictMatches(const DictionaryTriePerfTest
&perfTest
)
583 : BytesTrieDictLookup(perfTest
) {}
585 virtual void call(UErrorCode
*pErrorCode
) {
589 UText text
=UTEXT_INITIALIZER
;
591 const ULine
*lines
=perf
.getCachedLines();
592 int32_t numLines
=perf
.getNumLines();
593 for(int32_t i
=0; i
<numLines
; ++i
) {
594 // Skip comment lines (start with a character below 'A').
595 if(lines
[i
].name
[0]<0x41) {
598 utext_openUChars(&text
, lines
[i
].name
, lines
[i
].len
, pErrorCode
);
600 bytesTrieMatches(*trie
, &text
, lines
[i
].len
,
601 lengths
, count
, LENGTHOF(lengths
));
602 if(count
==0 || lengths
[count
-1]!=lines
[i
].len
) {
603 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
609 class BytesTrieDictContains
: public BytesTrieDictLookup
{
611 BytesTrieDictContains(const DictionaryTriePerfTest
&perfTest
)
612 : BytesTrieDictLookup(perfTest
) {}
614 virtual void call(UErrorCode
* /*pErrorCode*/) {
618 const ULine
*lines
=perf
.getCachedLines();
619 int32_t numLines
=perf
.getNumLines();
620 for(int32_t i
=0; i
<numLines
; ++i
) {
621 const UChar
*line
=lines
[i
].name
;
622 // Skip comment lines (start with a character below 'A').
626 UStringTrieResult result
=trie
->first(thaiCharToByte(line
[0]));
627 int32_t lineLength
=lines
[i
].len
;
628 for(int32_t j
=1; j
<lineLength
; ++j
) {
629 if(!USTRINGTRIE_HAS_NEXT(result
)) {
630 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
633 result
=trie
->next(thaiCharToByte(line
[j
]));
635 if(!USTRINGTRIE_HAS_VALUE(result
)) {
636 fprintf(stderr
, "word %ld (0-based) not found\n", (long)i
);
642 UPerfFunction
*DictionaryTriePerfTest::runIndexedTest(int32_t index
, UBool exec
,
643 const char *&name
, char * /*par*/) {
647 name
="ucharstriematches";
649 return new UCharsTrieDictMatches(*this);
653 name
="ucharstriecontains";
655 return new UCharsTrieDictContains(*this);
659 name
="bytestriematches";
661 return new BytesTrieDictMatches(*this);
665 name
="bytestriecontains";
667 return new BytesTrieDictContains(*this);
675 if(index
==0 && exec
) {
676 puts("Running BytesTrie perf tests on the .dat package file from the --sourcedir.\n"
677 "For UCharsTrie perf tests on a dictionary text file, specify the -f or --file-name.\n");
681 name
="simplebinarysearch";
683 return new BinarySearchPackageLookup(*this);
687 name
="prefixbinarysearch";
689 return new PrefixBinarySearchPackageLookup(*this);
695 return new BytesTriePackageLookup(*this);
706 int main(int argc
, const char *argv
[]) {
707 IcuToolErrorCode
errorCode("dicttrieperf main()");
708 DictionaryTriePerfTest
test(argc
, argv
, errorCode
);
709 if(errorCode
.isFailure()) {
710 fprintf(stderr
, "DictionaryTriePerfTest() failed: %s\n", errorCode
.errorName());
712 return errorCode
.reset();
715 fprintf(stderr
, "FAILED: Tests could not be run, please check the arguments.\n");