]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbirb.h
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbirb.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // rbbirb.h
5 //
6 // Copyright (C) 2002-2008, International Business Machines Corporation and others.
7 // All Rights Reserved.
8 //
9 // This file contains declarations for several classes from the
10 // Rule Based Break Iterator rule builder.
11 //
12
13
14 #ifndef RBBIRB_H
15 #define RBBIRB_H
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_BREAK_ITERATION
20
#include <utility>

#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
                             //    looks up references to $variables within a set.

// Debug builds map RBBIDebugPrintf / RBBIDebugPuts onto printf / puts.
// The C header is pulled in here, at file scope, rather than between
// U_NAMESPACE_BEGIN and U_NAMESPACE_END. It is placed after the project
// headers so that an RBBI_DEBUG definition coming from any of them is seen.
#ifdef RBBI_DEBUG
#include <stdio.h>
#endif
31
32
33 U_NAMESPACE_BEGIN
34
35 class RBBIRuleScanner;
36 struct RBBIRuleTableEl;
37 class RBBISetBuilder;
38 class RBBINode;
39 class RBBITableBuilder;
40
41
42
43 //--------------------------------------------------------------------------------
44 //
45 // RBBISymbolTable. Implements SymbolTable interface that is used by the
46 // UnicodeSet parser to resolve references to $variables.
47 //
48 //--------------------------------------------------------------------------------
49 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
50 public: // of these structs for each entry.
51 RBBISymbolTableEntry();
52 UnicodeString key;
53 RBBINode *val;
54 ~RBBISymbolTableEntry();
55
56 private:
57 RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
58 RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
59 };
60
61
62 class RBBISymbolTable : public UMemory, public SymbolTable {
63 private:
64 const UnicodeString &fRules;
65 UHashtable *fHashTable;
66 RBBIRuleScanner *fRuleScanner;
67
68 // These next two fields are part of the mechanism for passing references to
69 // already-constructed UnicodeSets back to the UnicodeSet constructor
70 // when the pattern includes $variable references.
71 const UnicodeString ffffString; // = "/uffff"
72 UnicodeSet *fCachedSetLookup;
73
74 public:
75 // API inherited from class SymbolTable
76 virtual const UnicodeString* lookup(const UnicodeString& s) const;
77 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
78 virtual UnicodeString parseReference(const UnicodeString& text,
79 ParsePosition& pos, int32_t limit) const;
80
81 // Additional Functions
82 RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
83 virtual ~RBBISymbolTable();
84
85 virtual RBBINode *lookupNode(const UnicodeString &key) const;
86 virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err);
87
88 #ifdef RBBI_DEBUG
89 virtual void rbbiSymtablePrint() const;
90 #else
91 // A do-nothing inline function for non-debug builds. Member funcs can't be empty
92 // or the call sites won't compile.
93 int32_t fFakeField;
94 #define rbbiSymtablePrint() fFakeField=0;
95 #endif
96
97 private:
98 RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
99 RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
100 };
101
102
103 //--------------------------------------------------------------------------------
104 //
105 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
106 //
107 //--------------------------------------------------------------------------------
108 class RBBIRuleBuilder : public UMemory {
109 public:
110
111 // Create a rule based break iterator from a set of rules.
112 // This function is the main entry point into the rule builder. The
113 // public ICU API for creating RBBIs uses this function to do the actual work.
114 //
115 static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules,
116 UParseError *parseError,
117 UErrorCode &status);
118
119 public:
120 // The "public" functions and data members that appear below are accessed
121 // (and shared) by the various parts that make up the rule builder. They
122 // are NOT intended to be accessed by anything outside of the
123 // rule builder implementation.
124 RBBIRuleBuilder(const UnicodeString &rules,
125 UParseError *parseErr,
126 UErrorCode &status
127 );
128
129 virtual ~RBBIRuleBuilder();
130
131 /**
132 * Build the state tables and char class Trie from the source rules.
133 */
134 RBBIDataHeader *build(UErrorCode &status);
135
136
137 /**
138 * Fold together redundant character classes (table columns) and
139 * redundant states (table rows). Done after initial table generation,
140 * before serializing the result.
141 */
142 void optimizeTables();
143
144 char *fDebugEnv; // controls debug trace output
145 UErrorCode *fStatus; // Error reporting. Keeping status
146 UParseError *fParseError; // here avoids passing it everywhere.
147 const UnicodeString &fRules; // The rule string that we are compiling
148 UnicodeString fStrippedRules; // The rule string, with comments stripped.
149
150 RBBIRuleScanner *fScanner; // The scanner.
151 RBBINode *fForwardTree; // The parse trees, generated by the scanner,
152 RBBINode *fReverseTree; // then manipulated by subsequent steps.
153 RBBINode *fSafeFwdTree;
154 RBBINode *fSafeRevTree;
155
156 RBBINode **fDefaultTree; // For rules not qualified with a !
157 // the tree to which they belong to.
158
159 UBool fChainRules; // True for chained Unicode TR style rules.
160 // False for traditional regexp rules.
161
162 UBool fLBCMNoChain; // True: suppress chaining of rules on
163 // chars with LineBreak property == CM.
164
165 UBool fLookAheadHardBreak; // True: Look ahead matches cause an
166 // immediate break, no continuing for the
167 // longest match.
168
169 RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
170 UVector *fUSetNodes; // Vector of all uset nodes.
171
172 RBBITableBuilder *fForwardTable; // State transition table, build time form.
173
174 UVector *fRuleStatusVals; // The values that can be returned
175 // from getRuleStatus().
176
177 RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
178 // data tables..
179 private:
180 RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
181 RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
182 };
183
184
185
186
187 //----------------------------------------------------------------------------
188 //
189 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
190 // been encountered. The val Node will be of nodetype uset
191 // and contain pointers to the actual UnicodeSets.
192 // The Key is the source string for initializing the set.
193 //
194 // The hash table is used to avoid creating duplicate
195 // unnamed (not $var references) UnicodeSets.
196 //
197 // Memory Management:
198 // The Hash Table owns these RBBISetTableEl structs and
199 // the key strings. It does NOT own the val nodes.
200 //
201 //----------------------------------------------------------------------------
202 struct RBBISetTableEl {
203 UnicodeString *key;
204 RBBINode *val;
205 };
206
/**
 * A pair of ints, used to bundle pairs of states or pairs of character classes.
 * The two values are reached through the usual std::pair members,
 * `first` and `second`.
 */
typedef std::pair<int32_t, int32_t> IntPair;
211
212
213 //----------------------------------------------------------------------------
214 //
215 // RBBIDebugPrintf Printf equivalent, for debugging output.
216 // Conditional compilation of the implementation lets us
217 // get rid of the stdio dependency in environments where it
218 // is unavailable.
219 //
220 //----------------------------------------------------------------------------
// Debug output macros.
//
//   RBBI_DEBUG defined:      RBBIDebugPrintf -> printf,  RBBIDebugPuts -> puts.
//                            <stdio.h> is included at the top of this header,
//                            at file scope, not here: this point lies between
//                            U_NAMESPACE_BEGIN and U_NAMESPACE_END, and a
//                            header must not be included inside a declaration.
//
//   RBBI_DEBUG not defined:  RBBIDebugPuts(arg) expands to nothing, so its
//                            argument is never evaluated.
//                            RBBIDebugPrintf is left undefined, so any use of
//                            it has to sit inside its own #ifdef RBBI_DEBUG
//                            (presumably deliberate, so that no variadic
//                            macro is needed).
#ifdef RBBI_DEBUG
#define RBBIDebugPrintf printf
#define RBBIDebugPuts puts
#else
#undef RBBIDebugPrintf
#define RBBIDebugPuts(arg)
#endif
229
230 U_NAMESPACE_END
231
232 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
233
234 #endif
235
236
237