]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | // |
4 | // rbbirb.h | |
5 | // | |
46f4442e | 6 | // Copyright (C) 2002-2008, International Business Machines Corporation and others. |
b75a7d8f A |
7 | // All Rights Reserved. |
8 | // | |
374ca955 A |
9 | // This file contains declarations for several classes from the |
10 | // Rule Based Break Iterator rule builder. | |
b75a7d8f A |
11 | // |
12 | ||
13 | ||
14 | #ifndef RBBIRB_H | |
15 | #define RBBIRB_H | |
16 | ||
17 | #include "unicode/utypes.h" | |
0f5d89e8 A |
18 | |
19 | #if !UCONFIG_NO_BREAK_ITERATION | |
20 | ||
21 | #include <utility> | |
22 | ||
b75a7d8f A |
23 | #include "unicode/uobject.h" |
24 | #include "unicode/rbbi.h" | |
25 | #include "unicode/uniset.h" | |
26 | #include "unicode/parseerr.h" | |
27 | #include "uhash.h" | |
28 | #include "uvector.h" | |
374ca955 | 29 | #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
0f5d89e8 | 30 | // looks up references to $variables within a set. |
b75a7d8f A |
31 | |
32 | ||
33 | U_NAMESPACE_BEGIN | |
34 | ||
35 | class RBBIRuleScanner; | |
36 | struct RBBIRuleTableEl; | |
37 | class RBBISetBuilder; | |
38 | class RBBINode; | |
39 | class RBBITableBuilder; | |
40 | ||
41 | ||
42 | ||
43 | //-------------------------------------------------------------------------------- | |
44 | // | |
45 | // RBBISymbolTable. Implements SymbolTable interface that is used by the | |
46 | // UnicodeSet parser to resolve references to $variables. | |
47 | // | |
48 | //-------------------------------------------------------------------------------- | |
49 | class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one of these | |
50 | public: // objects per entry: key is a UnicodeString, val points to the associated RBBINode. | |
51 | RBBISymbolTableEntry(); | |
52 | UnicodeString key; | |
53 | RBBINode *val; | |
54 | ~RBBISymbolTableEntry(); | |
55 | ||
56 | private: | |
57 | RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // private copy constructor: forbid copying of this class | |
58 | RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // private copy assignment: forbid copying of this class | |
59 | }; | |
60 | ||
61 | ||
62 | class RBBISymbolTable : public UMemory, public SymbolTable { | |
63 | private: | |
64 | const UnicodeString &fRules; | |
65 | UHashtable *fHashTable; | |
66 | RBBIRuleScanner *fRuleScanner; | |
67 | ||
68 | // These next two fields are part of the mechanism for passing references to | |
69 | // already-constructed UnicodeSets back to the UnicodeSet constructor | |
70 | // when the pattern includes $variable references. | |
71 | const UnicodeString ffffString; // = "\uffff" (NOTE(review): comment previously read "/uffff"; confirm against the constructor) | |
72 | UnicodeSet *fCachedSetLookup; | |
73 | ||
74 | public: | |
75 | // API inherited from class SymbolTable | |
76 | virtual const UnicodeString* lookup(const UnicodeString& s) const; | |
77 | virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; | |
78 | virtual UnicodeString parseReference(const UnicodeString& text, | |
79 | ParsePosition& pos, int32_t limit) const; | |
80 | ||
81 | // Additional functions, declared by RBBISymbolTable itself | |
82 | RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); | |
83 | virtual ~RBBISymbolTable(); | |
84 | ||
85 | virtual RBBINode *lookupNode(const UnicodeString &key) const; | |
86 | virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); | |
87 | ||
374ca955 A |
88 | #ifdef RBBI_DEBUG |
89 | virtual void rbbiSymtablePrint() const; | |
90 | #else | |
91 | // Non-debug builds: rbbiSymtablePrint() is a macro rather than a member function. It expands | |
92 | // to "fFakeField=0;" because an empty expansion would leave call sites that won't compile. | |
73c04bcf | 93 | int32_t fFakeField; |
374ca955 A |
94 | #define rbbiSymtablePrint() fFakeField=0; |
95 | #endif | |
b75a7d8f A |
96 | |
97 | private: | |
98 | RBBISymbolTable(const RBBISymbolTable &other); // private copy constructor: forbid copying of this class | |
99 | RBBISymbolTable &operator=(const RBBISymbolTable &other); // private copy assignment: forbid copying of this class | |
100 | }; | |
101 | ||
102 | ||
103 | //-------------------------------------------------------------------------------- | |
104 | // | |
105 | // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. | |
106 | // | |
107 | //-------------------------------------------------------------------------------- | |
108 | class RBBIRuleBuilder : public UMemory { | |
109 | public: | |
110 | ||
111 | // Create a rule based break iterator from a set of rules. | |
112 | // This function is the main entry point into the rule builder. The | |
113 | // public ICU API for creating RBBIs uses this function to do the actual work. | |
114 | // | |
115 | static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, | |
46f4442e | 116 | UParseError *parseError, |
b75a7d8f A |
117 | UErrorCode &status); |
118 | ||
119 | public: | |
120 | // The "public" functions and data members that appear below are accessed | |
121 | // (and shared) by the various parts that make up the rule builder. They | |
122 | // are NOT intended to be accessed by anything outside of the | |
123 | // rule builder implementation. | |
124 | RBBIRuleBuilder(const UnicodeString &rules, | |
46f4442e | 125 | UParseError *parseErr, |
b75a7d8f | 126 | UErrorCode &status |
0f5d89e8 | 127 | ); |
b75a7d8f A |
128 | |
129 | virtual ~RBBIRuleBuilder(); | |
0f5d89e8 A |
130 | |
131 | /** | |
132 | * Build the state tables and char class Trie from the source rules. | |
133 | */ | |
134 | RBBIDataHeader *build(UErrorCode &status); | |
135 | ||
136 | ||
137 | /** | |
138 | * Fold together redundant character classes (table columns) and | |
139 | * redundant states (table rows). Done after initial table generation, | |
140 | * before serializing the result. | |
141 | */ | |
142 | void optimizeTables(); | |
143 | ||
b75a7d8f A |
144 | char *fDebugEnv; // controls debug trace output |
145 | UErrorCode *fStatus; // Error reporting. Keeping status | |
146 | UParseError *fParseError; // here avoids passing it everywhere. | |
147 | const UnicodeString &fRules; // The rule string that we are compiling | |
0f5d89e8 | 148 | UnicodeString fStrippedRules; // The rule string, with comments stripped. |
b75a7d8f A |
149 | |
150 | RBBIRuleScanner *fScanner; // The scanner. | |
151 | RBBINode *fForwardTree; // The parse trees, generated by the scanner, | |
152 | RBBINode *fReverseTree; // then manipulated by subsequent steps. | |
374ca955 A |
153 | RBBINode *fSafeFwdTree; |
154 | RBBINode *fSafeRevTree; | |
155 | ||
156 | RBBINode **fDefaultTree; // For rules not qualified with a !, | |
157 | // the tree to which they belong. | |
158 | ||
159 | UBool fChainRules; // True for chained Unicode TR style rules. | |
160 | // False for traditional regexp rules. | |
161 | ||
162 | UBool fLBCMNoChain; // True: suppress chaining of rules on | |
163 | // chars with LineBreak property == CM. | |
164 | ||
165 | UBool fLookAheadHardBreak; // True: Look ahead matches cause an | |
166 | // immediate break, no continuing for the | |
167 | // longest match. | |
b75a7d8f A |
168 | |
169 | RBBISetBuilder *fSetBuilder; // Set and Character Category builder. | |
170 | UVector *fUSetNodes; // Vector of all uset nodes. | |
171 | ||
0f5d89e8 | 172 | RBBITableBuilder *fForwardTable; // State transition table, build time form. |
374ca955 A |
173 | |
174 | UVector *fRuleStatusVals; // The values that can be returned | |
175 | // from getRuleStatus(). | |
b75a7d8f A |
176 | |
177 | RBBIDataHeader *flattenData(); // Create the flattened (runtime format) | |
178 | // data tables. | |
179 | private: | |
180 | RBBIRuleBuilder(const RBBIRuleBuilder &other); // private copy constructor: forbid copying of this class | |
181 | RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // private copy assignment: forbid copying of this class | |
182 | }; | |
183 | ||
184 | ||
185 | ||
186 | ||
187 | //---------------------------------------------------------------------------- | |
188 | // | |
189 | // RBBISetTableEl is an entry in the hash table of UnicodeSets that have | |
190 | // been encountered. The val node will be of node type uset | |
191 | // and contain pointers to the actual UnicodeSets. | |
192 | // The key is the source string for initializing the set. | |
193 | // | |
194 | // The hash table is used to avoid creating duplicate | |
195 | // unnamed (i.e. not $var references) UnicodeSets. | |
196 | // | |
197 | // Memory Management: | |
198 | // The hash table owns these RBBISetTableEl structs and | |
199 | // the key strings. It does NOT own the val nodes. | |
200 | // | |
201 | //---------------------------------------------------------------------------- | |
202 | struct RBBISetTableEl { | |
203 | UnicodeString *key; | |
204 | RBBINode *val; | |
205 | }; | |
206 | ||
0f5d89e8 A |
207 | /** |
208 | * A pair of int32_t values, used to bundle pairs of states or pairs of character classes. | |
209 | */ | |
210 | typedef std::pair<int32_t, int32_t> IntPair; | |
211 | ||
b75a7d8f A |
212 | |
213 | //---------------------------------------------------------------------------- | |
214 | // | |
215 | // RBBIDebugPrintf Printf equivalent, for debugging output. | |
216 | // Conditional compilation of the implementation lets us | |
217 | // get rid of the stdio dependency in environments where it | |
218 | // is unavailable. | |
219 | // Without RBBI_DEBUG, RBBIDebugPrintf is #undef'd and RBBIDebugPuts(arg) expands to nothing. | |
220 | //---------------------------------------------------------------------------- | |
221 | #ifdef RBBI_DEBUG | |
222 | #include <stdio.h> | |
223 | #define RBBIDebugPrintf printf | |
374ca955 | 224 | #define RBBIDebugPuts puts |
b75a7d8f | 225 | #else |
73c04bcf | 226 | #undef RBBIDebugPrintf |
374ca955 | 227 | #define RBBIDebugPuts(arg) |
b75a7d8f A |
228 | #endif |
229 | ||
230 | U_NAMESPACE_END | |
0f5d89e8 A |
231 | |
232 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
233 | ||
b75a7d8f A |
234 | #endif |
235 | ||
236 | ||
237 |