1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*************************************************************************
4 * Copyright (c) 2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *************************************************************************
8 #ifndef RBBIMONKEYTEST_H
9 #define RBBIMONKEYTEST_H
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
17 #include "unicode/rbbi.h"
18 #include "unicode/regex.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "unicode/uobject.h"
23 #include "simplethread.h"
28 // RBBI Monkey Test. Run break iterators against randomly generated strings, compare results with
29 // an independent reference implementation.
31 // The monkey test can be run with parameters, e.g.
32 // intltest rbbi/RBBIMonkeyTest@loop=-1,rules=word.txt
33 // will run word break testing in an infinite loop.
35 // rules=name Test against the named reference rule file.
36 // Files are found in source/test/testdata/break_rules
37 // loop=nnn Loop nnn times. -1 for no limit. loop of 1 is useful for debugging.
38 // seed=nnnn Random number generator seed. Allows recreation of a failure.
39 // Error messages include the necessary seed value.
40 // verbose Display details of a failure. Useful for debugging. Use with loop=1.
41 // expansions Debug option, show expansions of rules and sets.
44 // Develop a tailoring format.
45 // Hook to old tests that use monkey impl to get expected data.
48 class BreakRules
; // Forward declaration
52 * Test the RuleBasedBreakIterator class giving different rules
54 class RBBIMonkeyTest
: public IntlTest
{
57 virtual ~RBBIMonkeyTest();
59 void runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* par
= nullptr
); // par is optional and defaults to a null pointer; name is passed by reference.
64 const char *fParams
; // Copy of user parameters passed in from IntlTest.
67 void testRules(const char *ruleFile
);
68 static UBool
getIntParam(UnicodeString name
, UnicodeString
&params
, int64_t &val
, UErrorCode
&status
); // Presumably extracts the integer parameter 'name' (e.g. loop=nnn, seed=nnnn) from params into val - confirm against the implementation.
69 static UBool
getStringParam(UnicodeString name
, UnicodeString
&params
, CharString
&dest
, UErrorCode
&status
); // Presumably extracts the string parameter 'name' (e.g. rules=name) from params into dest - confirm against the implementation.
70 static UBool
getBoolParam(UnicodeString name
, UnicodeString
&params
, UBool
&dest
, UErrorCode
&status
); // Presumably reports a flag-style parameter 'name' (e.g. verbose, expansions) from params via dest - confirm against the implementation.
74 // The following classes are internal to the RBBI Monkey Test implementation.
78 // class CharClass Represents a single character class from the source break rules.
79 // Inherits from UObject because instances are adopted by UHashtable, which ultimately
80 // deletes them using hash's object deleter function.
82 class CharClass
: public UObject
{
85 UnicodeString fOriginalDef
; // set definition as it appeared in user supplied rules.
86 UnicodeString fExpandedDef
; // set definition with any embedded named sets replaced by their defs, recursively.
87 LocalPointer
<const UnicodeSet
> fSet
;
88 CharClass(const UnicodeString
&name
, const UnicodeString
&originalDef
, const UnicodeString
&expandedDef
, const UnicodeSet
*set
) :
89 fName(name
), fOriginalDef(originalDef
), fExpandedDef(expandedDef
), fSet(set
) {}
93 // class BreakRule represents a single rule from a set of break rules.
94 // Each rule has the set definitions expanded, and
95 // is compiled to a regular expression.
97 class BreakRule
: public UObject
{
101 UnicodeString fName
; // Name of the rule.
102 UnicodeString fRule
; // Rule expression, excluding the name, as written in user source.
103 UnicodeString fExpandedRule
; // Rule expression after expanding the set definitions.
104 LocalPointer
<RegexMatcher
> fRuleMatcher
; // Regular expression that matches the rule.
105 bool fInitialMatchOnly
= false; // True if rule begins with '^', meaning no chaining.
109 // class BreakRules represents a complete set of break rules, possibly tailored,
110 // compiled from testdata break rules.
112 class BreakRules
: public UObject
{
114 BreakRules(RBBIMonkeyImpl
*monkeyImpl
, UErrorCode
&status
);
117 void compileRules(UCHARBUF
*rules
, UErrorCode
&status
);
119 const CharClass
*getClassForChar(UChar32 c
, int32_t *iter
=nullptr
) const; // Returns a pointer to a const CharClass for code point c; iter is optional and defaults to a null pointer.
122 RBBIMonkeyImpl
*fMonkeyImpl
; // Pointer back to the owning MonkeyImpl instance.
123 icu::UVector fBreakRules
; // Contents are of type (BreakRule *).
125 LocalUHashtablePointer fCharClasses
; // Key is set name (UnicodeString).
126 // Value is (CharClass *)
127 LocalPointer
<UVector
> fCharClassList
; // Char Classes, same contents as fCharClasses values,
128 // but in a vector so they can be accessed by index.
129 UnicodeSet fDictionarySet
; // Dictionary set, empty if none is defined.
131 UBreakIteratorType fType
;
133 CharClass
*addCharClass(const UnicodeString
&name
, const UnicodeString
&def
, UErrorCode
&status
);
134 void addRule(const UnicodeString
&name
, const UnicodeString
&def
, UErrorCode
&status
);
135 bool setKeywordParameter(const UnicodeString
&keyword
, const UnicodeString
&value
, UErrorCode
&status
);
136 RuleBasedBreakIterator
*createICUBreakIterator(UErrorCode
&status
);
138 LocalPointer
<RegexMatcher
> fSetRefsMatcher
;
139 LocalPointer
<RegexMatcher
> fCommentsMatcher
;
140 LocalPointer
<RegexMatcher
> fClassDefMatcher
;
141 LocalPointer
<RegexMatcher
> fRuleDefMatcher
;
145 // class MonkeyTestData represents a randomly synthesized test data string together
146 // with the expected break positions obtained by applying
147 // the test break rules.
149 class MonkeyTestData
: public UObject
{
152 ~MonkeyTestData() {};
153 void set(BreakRules
*rules
, IntlTest::icu_rand
&rand
, UErrorCode
&status
);
154 void clearActualBreaks();
155 void dump(int32_t around
= -1) const;
157 uint32_t fRandomSeed
; // The initial seed value from the random number generator.
158 const BreakRules
*fBkRules
; // The break rules used to generate this data.
159 UnicodeString fString
; // The text.
160 UnicodeString fExpectedBreaks
; // Breaks as found by the reference rules.
161 // Parallel to fString. Non-zero if break preceding.
162 UnicodeString fActualBreaks
; // Breaks as found by ICU break iterator.
163 UnicodeString fRuleForPosition
; // Index into BreakRules.fBreakRules of rule that applied at each position.
164 // Also parallel to fString.
165 UnicodeString f2ndRuleForPos
; // As above. A 2nd rule applies when the preceding rule
166 // didn't cause a break, and a subsequent rule match starts
167 // on the last code point of the preceding match.
174 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey
175 // test for one set of break rules.
177 // When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
178 // between instances of RBBIMonkeyImpl and threads.
180 class RBBIMonkeyImpl
: public UObject
{
182 RBBIMonkeyImpl(UErrorCode
&status
);
185 void setup(const char *ruleFileName
, UErrorCode
&status
);
191 LocalUCHARBUFPointer fRuleCharBuffer
; // source file contents of the reference rules.
192 LocalPointer
<BreakRules
> fRuleSet
;
193 LocalPointer
<RuleBasedBreakIterator
> fBI
;
194 LocalPointer
<MonkeyTestData
> fTestData
;
195 IntlTest::icu_rand fRandomGenerator
;
196 const char *fRuleFileName
;
197 UBool fVerbose
; // True to do long dump of failing data.
200 UBool fDumpExpansions
; // Debug flag to output expanded form of rules and sets.
202 enum CheckDirection
{
206 void clearActualBreaks();
207 void testForwards(UErrorCode
&status
);
208 void testPrevious(UErrorCode
&status
);
209 void testFollowing(UErrorCode
&status
);
210 void testPreceding(UErrorCode
&status
);
211 void testIsBoundary(UErrorCode
&status
);
212 void testIsBoundaryRandom(UErrorCode
&status
);
213 void checkResults(const char *msg
, CheckDirection dir
, UErrorCode
&status
);
215 class RBBIMonkeyThread
: public SimpleThread
{
217 RBBIMonkeyImpl
*fMonkeyImpl
;
219 RBBIMonkeyThread(RBBIMonkeyImpl
*impl
) : fMonkeyImpl(impl
) {};
220 void run() U_OVERRIDE
{ fMonkeyImpl
->runTest(); };
223 void openBreakRules(const char *fileName
, UErrorCode
&status
);
224 RBBIMonkeyThread fThread
;
228 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
230 #endif // RBBIMONKEYTEST_H