]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbimonkeytest.h
ICU-64243.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbimonkeytest.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*************************************************************************
4 * Copyright (c) 2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *************************************************************************
7 */
8 #ifndef RBBIMONKEYTEST_H
9 #define RBBIMONKEYTEST_H
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
14
15 #include "intltest.h"
16
17 #include "unicode/rbbi.h"
18 #include "unicode/regex.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "unicode/uobject.h"
22
23 #include "simplethread.h"
24 #include "ucbuf.h"
25 #include "uhash.h"
26 #include "uvector.h"
27
28 // RBBI Monkey Test. Run break iterators against randomly generated strings, compare results with
29 // an independent reference implementation.
30 //
31 // The monkey test can be run with parameters, e.g.
32 // intltest rbbi/RBBIMonkeyTest@loop=-1,rules=word.txt
33 // will run word break testing in an infinite loop.
34 // Summary of options
35 // rules=name Test against the named reference rule file.
36 // Files are found in source/test/testdata/break_rules
37 // loop=nnn Loop nnn times. -1 for no limit. loop of 1 is useful for debugging.
38 // seed=nnnn Random number generator seed. Allows recreation of a failure.
39 // Error messages include the necessary seed value.
40 // verbose Display details of a failure. Useful for debugging. Use with loop=1.
41 // expansions Debug option, show expansions of rules and sets.
42 //
43 // TODO:
44 // Develop a tailoring format.
45 // Hook to old tests that use monkey impl to get expected data.
46 // Remove old tests.
47
48 class BreakRules; // Forward declaration; the class is defined later in this header.
49 class RBBIMonkeyImpl; // Forward declaration; the class is defined later in this header.
50
51 /**
52 * Test the RuleBasedBreakIterator class giving different rules
53 */
54 class RBBIMonkeyTest: public IntlTest {
55 public:
56 RBBIMonkeyTest();
57 virtual ~RBBIMonkeyTest();
58
59 void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
60 void testMonkey();
61
62
63 private:
64 const char *fParams; // Copy of user parameters passed in from IntlTest.
65
66
67 void testRules(const char *ruleFile);
68 static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
69 static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
70 static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
71
72 };
73
74 // The following classes are internal to the RBBI Monkey Test implementation.
75
76
77
78 // class CharClass Represents a single character class from the source break rules.
79 // Inherits from UObject because instances are adopted by UHashtable, which ultimately
80 // deletes them using hash's object deleter function.
81
82 class CharClass: public UObject {
83 public:
84 UnicodeString fName;
85 UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules.
86 UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively.
87 LocalPointer<const UnicodeSet> fSet;
88 CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
89 fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
90 };
91
92
93 // class BreakRule represents a single rule from a set of break rules.
94 // Each rule has the set definitions expanded, and
95 // is compiled to a regular expression.
96
97 class BreakRule: public UObject {
98 public:
99 BreakRule();
100 ~BreakRule();
101 UnicodeString fName; // Name of the rule.
102 UnicodeString fRule; // Rule expression, excluding the name, as written in user source.
103 UnicodeString fExpandedRule; // Rule expression after expanding the set definitions.
104 LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule.
105 bool fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining.
106 };
107
108
109 // class BreakRules represents a complete set of break rules, possibly tailored,
110 // compiled from testdata break rules.
111
112 class BreakRules: public UObject {
113 public:
114 BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
115 ~BreakRules();
116
117 void compileRules(UCHARBUF *rules, UErrorCode &status);
118
119 const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
120
121
122 RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance.
123 icu::UVector fBreakRules; // Contents are of type (BreakRule *).
124
125 LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString).
126 // Value is (CharClass *)
127 LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values,
128 // but in a vector so they can be accessed by index.
129 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined.
130 Locale fLocale;
131 UBreakIteratorType fType;
132
133 CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
134 void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
135 bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
136 RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
137
138 LocalPointer<RegexMatcher> fSetRefsMatcher;
139 LocalPointer<RegexMatcher> fCommentsMatcher;
140 LocalPointer<RegexMatcher> fClassDefMatcher;
141 LocalPointer<RegexMatcher> fRuleDefMatcher;
142 };
143
144
145 // class MonkeyTestData represents a randomly synthesized test data string together
146 // with the expected break positions obtained by applying
147 // the test break rules.
148
149 class MonkeyTestData: public UObject {
150 public:
151 MonkeyTestData() {};
152 ~MonkeyTestData() {};
153 void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
154 void clearActualBreaks();
155 void dump(int32_t around = -1) const;
156
157 uint32_t fRandomSeed; // The initial seed value from the random number genererator.
158 const BreakRules *fBkRules; // The break rules used to generate this data.
159 UnicodeString fString; // The text.
160 UnicodeString fExpectedBreaks; // Breaks as found by the reference rules.
161 // Parallel to fString. Non-zero if break preceding.
162 UnicodeString fActualBreaks; // Breaks as found by ICU break iterator.
163 UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position.
164 // Also parallel to fString.
165 UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule
166 // didn't cause a break, and a subsequent rule match starts
167 // on the last code point of the preceding match.
168
169 };
170
171
172
173
174 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey
175 // test for one set of break rules.
176 //
177 // When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
178 // between instances of RBBIMonkeyImpl and threads.
179 //
180 class RBBIMonkeyImpl: public UObject {
181 public:
182 RBBIMonkeyImpl(UErrorCode &status);
183 ~RBBIMonkeyImpl();
184
185 void setup(const char *ruleFileName, UErrorCode &status);
186
187 void startTest();
188 void runTest();
189 void join();
190
191 LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules.
192 LocalPointer<BreakRules> fRuleSet;
193 LocalPointer<RuleBasedBreakIterator> fBI;
194 LocalPointer<MonkeyTestData> fTestData;
195 IntlTest::icu_rand fRandomGenerator;
196 const char *fRuleFileName;
197 UBool fVerbose; // True to do long dump of failing data.
198 int32_t fLoopCount;
199
200 UBool fDumpExpansions; // Debug flag to output epananded form of rules and sets.
201
202 enum CheckDirection {
203 FORWARD = 1,
204 REVERSE = 2
205 };
206 void clearActualBreaks();
207 void testForwards(UErrorCode &status);
208 void testPrevious(UErrorCode &status);
209 void testFollowing(UErrorCode &status);
210 void testPreceding(UErrorCode &status);
211 void testIsBoundary(UErrorCode &status);
212 void testIsBoundaryRandom(UErrorCode &status);
213 void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
214
215 class RBBIMonkeyThread: public SimpleThread {
216 private:
217 RBBIMonkeyImpl *fMonkeyImpl;
218 public:
219 RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
220 void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
221 };
222 private:
223 void openBreakRules(const char *fileName, UErrorCode &status);
224 RBBIMonkeyThread fThread;
225
226 };
227
228 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
229
230 #endif // RBBIMONKEYTEST_H