]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
2ca993e8 A |
3 | /************************************************************************* |
4 | * Copyright (c) 2016, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | ************************************************************************* | |
7 | */ | |
8 | #ifndef RBBIMONKEYTEST_H | |
9 | #define RBBIMONKEYTEST_H | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING | |
14 | ||
15 | #include "intltest.h" | |
16 | ||
17 | #include "unicode/rbbi.h" | |
18 | #include "unicode/regex.h" | |
19 | #include "unicode/uniset.h" | |
20 | #include "unicode/unistr.h" | |
21 | #include "unicode/uobject.h" | |
22 | ||
23 | #include "simplethread.h" | |
24 | #include "ucbuf.h" | |
25 | #include "uhash.h" | |
26 | #include "uvector.h" | |
27 | ||
0f5d89e8 A |
// RBBI Monkey Test. Run break iterators against randomly generated strings, compare results with
//                   an independent reference implementation.
//
//         The monkey test can be run with parameters, e.g.
//              intltest rbbi/RBBIMonkeyTest@loop=-1,rules=word.txt
//         will run word break testing in an infinite loop.
//         Summary of options
//               rules=name             Test against the named reference rule file.
//                                      Files are found in source/test/testdata/break_rules
//               loop=nnn               Loop nnn times. -1 for no limit. loop of 1 is useful for debugging.
//               seed=nnnn              Random number generator seed. Allows recreation of a failure.
//                                      Error messages include the necessary seed value.
//               verbose                Display details of a failure. Useful for debugging. Use with loop=1.
//               expansions             Debug option, show expansions of rules and sets.
//
//         TODO:
//               Develop a tailoring format.
//               Hook to old tests that use monkey impl to get expected data.
//               Remove old tests.
47 | ||
// Forward declarations of the implementation classes defined later in this header.
class BreakRules;
class RBBIMonkeyImpl;
50 | ||
51 | /** | |
52 | * Test the RuleBasedBreakIterator class giving different rules | |
53 | */ | |
54 | class RBBIMonkeyTest: public IntlTest { | |
55 | public: | |
56 | RBBIMonkeyTest(); | |
57 | virtual ~RBBIMonkeyTest(); | |
58 | ||
59 | void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); | |
60 | void testMonkey(); | |
61 | ||
62 | ||
63 | private: | |
64 | const char *fParams; // Copy of user parameters passed in from IntlTest. | |
65 | ||
66 | ||
67 | void testRules(const char *ruleFile); | |
68 | static UBool getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status); | |
69 | static UBool getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status); | |
70 | static UBool getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status); | |
71 | ||
72 | }; | |
73 | ||
74 | // The following classes are internal to the RBBI Monkey Test implementation. | |
75 | ||
76 | ||
77 | ||
78 | // class CharClass Represents a single character class from the source break rules. | |
79 | // Inherits from UObject because instances are adopted by UHashtable, which ultimately | |
80 | // deletes them using hash's object deleter function. | |
81 | ||
82 | class CharClass: public UObject { | |
83 | public: | |
84 | UnicodeString fName; | |
85 | UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules. | |
86 | UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. | |
87 | LocalPointer<const UnicodeSet> fSet; | |
88 | CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) : | |
89 | fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {} | |
90 | }; | |
91 | ||
92 | ||
93 | // class BreakRule represents a single rule from a set of break rules. | |
94 | // Each rule has the set definitions expanded, and | |
95 | // is compiled to a regular expression. | |
96 | ||
97 | class BreakRule: public UObject { | |
98 | public: | |
99 | BreakRule(); | |
100 | ~BreakRule(); | |
101 | UnicodeString fName; // Name of the rule. | |
102 | UnicodeString fRule; // Rule expression, excluding the name, as written in user source. | |
103 | UnicodeString fExpandedRule; // Rule expression after expanding the set definitions. | |
104 | LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule. | |
105 | }; | |
106 | ||
107 | ||
108 | // class BreakRules represents a complete set of break rules, possibly tailored, | |
109 | // compiled from testdata break rules. | |
110 | ||
111 | class BreakRules: public UObject { | |
112 | public: | |
113 | BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status); | |
114 | ~BreakRules(); | |
115 | ||
116 | void compileRules(UCHARBUF *rules, UErrorCode &status); | |
117 | ||
118 | const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const; | |
119 | ||
120 | ||
121 | RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. | |
122 | icu::UVector fBreakRules; // Contents are of type (BreakRule *). | |
123 | ||
124 | LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString). | |
125 | // Value is (CharClass *) | |
126 | LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values, | |
127 | // but in a vector so they can be accessed by index. | |
128 | UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. | |
129 | Locale fLocale; | |
130 | UBreakIteratorType fType; | |
131 | ||
132 | CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); | |
133 | void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); | |
134 | bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status); | |
135 | RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status); | |
136 | ||
137 | LocalPointer<RegexMatcher> fSetRefsMatcher; | |
138 | LocalPointer<RegexMatcher> fCommentsMatcher; | |
139 | LocalPointer<RegexMatcher> fClassDefMatcher; | |
140 | LocalPointer<RegexMatcher> fRuleDefMatcher; | |
141 | }; | |
142 | ||
143 | ||
144 | // class MonkeyTestData represents a randomly synthesized test data string together | |
145 | // with the expected break positions obtained by applying | |
146 | // the test break rules. | |
147 | ||
148 | class MonkeyTestData: public UObject { | |
149 | public: | |
150 | MonkeyTestData() {}; | |
151 | ~MonkeyTestData() {}; | |
152 | void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status); | |
153 | void clearActualBreaks(); | |
154 | void dump(int32_t around = -1) const; | |
155 | ||
156 | uint32_t fRandomSeed; // The initial seed value from the random number genererator. | |
157 | const BreakRules *fBkRules; // The break rules used to generate this data. | |
158 | UnicodeString fString; // The text. | |
159 | UnicodeString fExpectedBreaks; // Breaks as found by the reference rules. | |
160 | // Parallel to fString. Non-zero if break preceding. | |
161 | UnicodeString fActualBreaks; // Breaks as found by ICU break iterator. | |
162 | UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position. | |
163 | // Also parallel to fString. | |
164 | UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule | |
165 | // didn't cause a break, and a subsequent rule match starts | |
166 | // on the last code point of the preceding match. | |
167 | ||
168 | }; | |
169 | ||
170 | ||
171 | ||
172 | ||
173 | // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey | |
174 | // test for one set of break rules. | |
175 | // | |
176 | // When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence | |
177 | // between instances of RBBIMonkeyImpl and threads. | |
178 | // | |
179 | class RBBIMonkeyImpl: public UObject { | |
180 | public: | |
181 | RBBIMonkeyImpl(UErrorCode &status); | |
182 | ~RBBIMonkeyImpl(); | |
183 | ||
184 | void setup(const char *ruleFileName, UErrorCode &status); | |
185 | ||
186 | void startTest(); | |
187 | void runTest(); | |
188 | void join(); | |
189 | ||
190 | LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules. | |
191 | LocalPointer<BreakRules> fRuleSet; | |
192 | LocalPointer<RuleBasedBreakIterator> fBI; | |
193 | LocalPointer<MonkeyTestData> fTestData; | |
194 | IntlTest::icu_rand fRandomGenerator; | |
195 | const char *fRuleFileName; | |
196 | UBool fVerbose; // True to do long dump of failing data. | |
197 | int32_t fLoopCount; | |
198 | ||
199 | UBool fDumpExpansions; // Debug flag to output epananded form of rules and sets. | |
200 | ||
201 | enum CheckDirection { | |
202 | FORWARD = 1, | |
203 | REVERSE = 2 | |
204 | }; | |
205 | void clearActualBreaks(); | |
206 | void testForwards(UErrorCode &status); | |
207 | void testPrevious(UErrorCode &status); | |
208 | void testFollowing(UErrorCode &status); | |
209 | void testPreceding(UErrorCode &status); | |
210 | void testIsBoundary(UErrorCode &status); | |
0f5d89e8 | 211 | void testIsBoundaryRandom(UErrorCode &status); |
2ca993e8 A |
212 | void checkResults(const char *msg, CheckDirection dir, UErrorCode &status); |
213 | ||
214 | class RBBIMonkeyThread: public SimpleThread { | |
215 | private: | |
216 | RBBIMonkeyImpl *fMonkeyImpl; | |
217 | public: | |
218 | RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {}; | |
219 | void run() U_OVERRIDE { fMonkeyImpl->runTest(); }; | |
220 | }; | |
221 | private: | |
222 | void openBreakRules(const char *fileName, UErrorCode &status); | |
223 | RBBIMonkeyThread fThread; | |
224 | ||
225 | }; | |
226 | ||
227 | #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */ | |
228 | ||
229 | #endif // RBBIMONKEYTEST_H |