]>
Commit | Line | Data |
---|---|---|
1 | // | |
2 | // rbbirb.h | |
3 | // | |
4 | // Copyright (C) 2002-2008, International Business Machines Corporation and others. | |
5 | // All Rights Reserved. | |
6 | // | |
7 | // This file contains declarations for several classes from the | |
8 | // Rule Based Break Iterator rule builder. | |
9 | // | |
10 | ||
11 | ||
12 | #ifndef RBBIRB_H | |
13 | #define RBBIRB_H | |
14 | ||
15 | #include "unicode/utypes.h" | |
16 | #include "unicode/uobject.h" | |
17 | #include "unicode/rbbi.h" | |
18 | #include "unicode/uniset.h" | |
19 | #include "unicode/parseerr.h" | |
20 | #include "uhash.h" | |
21 | #include "uvector.h" | |
22 | #include "unicode/symtable.h"// For UnicodeSet parsing: SymbolTable is the interface that | |
23 | // looks up references to $variables within a set. | |
24 | ||
25 | ||
26 | ||
27 | U_NAMESPACE_BEGIN | |
28 | ||
29 | class RBBIRuleScanner; | |
30 | struct RBBIRuleTableEl; | |
31 | class RBBISetBuilder; | |
32 | class RBBINode; | |
33 | class RBBITableBuilder; | |
34 | ||
35 | ||
36 | ||
37 | //-------------------------------------------------------------------------------- | |
38 | // | |
39 | // RBBISymbolTable. Implements SymbolTable interface that is used by the | |
40 | // UnicodeSet parser to resolve references to $variables. | |
41 | // | |
42 | //-------------------------------------------------------------------------------- | |
43 | class RBBISymbolTableEntry : public UMemory { // One entry of the symbol table hash table: the hash table | |
44 | public: // contains one of these objects for each entry, pairing a key string with an RBBINode. | |
45 | RBBISymbolTableEntry(); | |
46 | UnicodeString key; | |
47 | RBBINode *val; | |
48 | ~RBBISymbolTableEntry(); | |
49 | ||
50 | private: | |
51 | RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // private copy constructor: forbids copying of this class | |
52 | RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // private assignment operator: forbids copying of this class | |
53 | }; | |
54 | ||
55 | ||
56 | class RBBISymbolTable : public UMemory, public SymbolTable { | |
57 | private: | |
58 | const UnicodeString &fRules; | |
59 | UHashtable *fHashTable; | |
60 | RBBIRuleScanner *fRuleScanner; | |
61 | ||
62 | // The next two fields are part of the mechanism for passing references to | |
63 | // already-constructed UnicodeSets back to the UnicodeSet constructor | |
64 | // when the pattern includes $variable references. | |
65 | const UnicodeString ffffString; // = "/uffff"  NOTE(review): presumably the single character U+FFFF was meant; confirm against the constructor. | |
66 | UnicodeSet *fCachedSetLookup; | |
67 | ||
68 | public: | |
69 | // Implementation of the API inherited from class SymbolTable. | |
70 | virtual const UnicodeString* lookup(const UnicodeString& s) const; | |
71 | virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; | |
72 | virtual UnicodeString parseReference(const UnicodeString& text, | |
73 | ParsePosition& pos, int32_t limit) const; | |
74 | ||
75 | // Additional functions: construction, destruction, and node-level lookup and insertion. | |
76 | RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); | |
77 | virtual ~RBBISymbolTable(); | |
78 | ||
79 | virtual RBBINode *lookupNode(const UnicodeString &key) const; | |
80 | virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); | |
81 | ||
82 | #ifdef RBBI_DEBUG | |
83 | virtual void rbbiSymtablePrint() const; | |
84 | #else | |
85 | // Non-debug builds: rbbiSymtablePrint() is a preprocessor macro, not a member function. It expands to | |
86 | // a dummy assignment to the public field fFakeField (followed by ';') so that call sites still compile. | |
87 | int32_t fFakeField; | |
88 | #define rbbiSymtablePrint() fFakeField=0; | |
89 | #endif | |
90 | ||
91 | private: | |
92 | RBBISymbolTable(const RBBISymbolTable &other); // private copy constructor: forbids copying of this class | |
93 | RBBISymbolTable &operator=(const RBBISymbolTable &other); // private assignment operator: forbids copying of this class | |
94 | }; | |
95 | ||
96 | ||
97 | //-------------------------------------------------------------------------------- | |
98 | // | |
99 | // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. | |
100 | // | |
101 | //-------------------------------------------------------------------------------- | |
102 | class RBBIRuleBuilder : public UMemory { | |
103 | public: | |
104 | ||
105 | // Create a rule based break iterator from a set of rules. | |
106 | // This function is the main entry point into the rule builder. The | |
107 | // public ICU API for creating RBBIs uses this function to do the actual work. | |
108 | // | |
109 | static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, | |
110 | UParseError *parseError, | |
111 | UErrorCode &status); | |
112 | ||
113 | public: | |
114 | // The "public" functions and data members that appear below are accessed | |
115 | // (and shared) by the various parts that make up the rule builder. They | |
116 | // are NOT intended to be accessed by anything outside of the | |
117 | // rule builder implementation. | |
118 | RBBIRuleBuilder(const UnicodeString &rules, | |
119 | UParseError *parseErr, | |
120 | UErrorCode &status | |
121 | ); | |
122 | ||
123 | virtual ~RBBIRuleBuilder(); | |
124 | char *fDebugEnv; // Controls debug trace output. | |
125 | UErrorCode *fStatus; // Error reporting. Keeping the status | |
126 | UParseError *fParseError; // here avoids passing it everywhere. | |
127 | const UnicodeString &fRules; // The rule string that we are compiling. | |
128 | ||
129 | RBBIRuleScanner *fScanner; // The scanner. | |
130 | RBBINode *fForwardTree; // The parse trees, generated by the scanner, | |
131 | RBBINode *fReverseTree; // then manipulated by subsequent steps. | |
132 | RBBINode *fSafeFwdTree; | |
133 | RBBINode *fSafeRevTree; | |
134 | ||
135 | RBBINode **fDefaultTree; // For rules not qualified with a '!', | |
136 | // the tree to which they belong. | |
137 | ||
138 | UBool fChainRules; // True for chained Unicode TR style rules. | |
139 | // False for traditional regexp rules. | |
140 | ||
141 | UBool fLBCMNoChain; // True: suppress chaining of rules on | |
142 | // chars with LineBreak property == CM. | |
143 | ||
144 | UBool fLookAheadHardBreak; // True: Look ahead matches cause an | |
145 | // immediate break, no continuing for the | |
146 | // longest match. | |
147 | ||
148 | UBool fRINoChain; // True: suppress chaining of rules on chars | |
149 | // with (grapheme/word/line)break property == RI. | |
150 | ||
151 | RBBISetBuilder *fSetBuilder; // Set and Character Category builder. | |
152 | UVector *fUSetNodes; // Vector of all uset nodes. | |
153 | ||
154 | RBBITableBuilder *fForwardTables; // State transition tables | |
155 | RBBITableBuilder *fReverseTables; | |
156 | RBBITableBuilder *fSafeFwdTables; | |
157 | RBBITableBuilder *fSafeRevTables; | |
158 | ||
159 | UVector *fRuleStatusVals; // The values that can be returned | |
160 | // from getRuleStatus(). | |
161 | ||
162 | RBBIDataHeader *flattenData(); // Create the flattened (runtime format) | |
163 | // data tables. | |
164 | private: | |
165 | RBBIRuleBuilder(const RBBIRuleBuilder &other); // private copy constructor: forbids copying of this class | |
166 | RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // private assignment operator: forbids copying of this class | |
167 | }; | |
168 | ||
169 | ||
170 | ||
171 | ||
172 | //---------------------------------------------------------------------------- | |
173 | // | |
174 | // RBBISetTableEl is an entry in the hash table of UnicodeSets that have | |
175 | // been encountered. The val node will be of nodetype uset | |
176 | // and contain pointers to the actual UnicodeSets. | |
177 | // The key is the source string for initializing the set. | |
178 | // | |
179 | // The hash table is used to avoid creating duplicate | |
180 | // unnamed UnicodeSets (those that are not $var references). | |
181 | // | |
182 | // Memory Management: | |
183 | // The hash table owns these RBBISetTableEl structs and | |
184 | // the key strings. It does NOT own the val nodes. | |
185 | // | |
186 | //---------------------------------------------------------------------------- | |
187 | struct RBBISetTableEl { | |
188 | UnicodeString *key; | |
189 | RBBINode *val; | |
190 | }; | |
191 | ||
192 | ||
193 | //---------------------------------------------------------------------------- | |
194 | // | |
195 | // RBBIDebugPrintf, RBBIDebugPuts printf/puts equivalents, for debugging output. | |
196 | // With RBBI_DEBUG defined they are macros for stdio's printf and puts. | |
197 | // Otherwise <stdio.h> is not included here, RBBIDebugPuts(arg) expands to nothing and RBBIDebugPrintf | |
198 | // is left undefined; this gets rid of the stdio dependency in environments where it is unavailable. | |
199 | // NOTE(review): with no non-debug definition, uses of RBBIDebugPrintf presumably must be guarded by RBBI_DEBUG; confirm at the call sites. | |
200 | //---------------------------------------------------------------------------- | |
201 | #ifdef RBBI_DEBUG | |
202 | #include <stdio.h> | |
203 | #define RBBIDebugPrintf printf | |
204 | #define RBBIDebugPuts puts | |
205 | #else | |
206 | #undef RBBIDebugPrintf | |
207 | #define RBBIDebugPuts(arg) | |
208 | #endif | |
209 | ||
210 | U_NAMESPACE_END | |
211 | #endif | |
212 | ||
213 | ||
214 |