4 // Copyright (C) 2002-2005, International Business Machines Corporation and others.
5 // All Rights Reserved.
7 // This file contains declarations for several classes from the
8 // Rule Based Break Iterator rule builder.
15 #include "unicode/utypes.h"
16 #include "unicode/uobject.h"
17 #include "unicode/rbbi.h"
18 #include "unicode/uniset.h"
19 #include "unicode/parseerr.h"
22 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
23 // looks up references to $variables within a set.
// Forward declarations of rule-builder types that are defined elsewhere.
// Within this header, RBBIRuleScanner and RBBITableBuilder are referred to
// only through pointers, so their full definitions are not needed here.
29 class RBBIRuleScanner
;
30 struct RBBIRuleTableEl
;
33 class RBBITableBuilder
;
37 //--------------------------------------------------------------------------------
39 // RBBISymbolTable. Implements SymbolTable interface that is used by the
40 // UnicodeSet parser to resolve references to $variables.
42 //--------------------------------------------------------------------------------
// Record type stored in the symbol table's hash table, one per entry.
43 class RBBISymbolTableEntry
: public UMemory
{ // The symbol table hash table contains one
44 public: // of these structs for each entry.
45 RBBISymbolTableEntry();
48 ~RBBISymbolTableEntry();
// The copy constructor and copy assignment operator below are declared only
// to forbid copying.
// NOTE(review): presumably these are private and left undefined -- the
// access specifier is not visible here; confirm.
51 RBBISymbolTableEntry(const RBBISymbolTableEntry
&other
); // forbid copying of this class
52 RBBISymbolTableEntry
&operator=(const RBBISymbolTableEntry
&other
); // forbid copying of this class
// Symbol table for the rule builder. It derives from SymbolTable so that it
// can be handed to the UnicodeSet parser, and adds node-based lookup and
// insertion (lookupNode / addEntry) on top of the inherited interface.
56 class RBBISymbolTable
: public UMemory
, public SymbolTable
{
58 const UnicodeString
&fRules
; // Held by reference, not copied. NOTE(review): presumably bound to the
  // constructor's fRules argument, so that string must outlive this object.
59 UHashtable
*fHashTable
; // The hash table of symbol table entries.
60 RBBIRuleScanner
*fRuleScanner
; // NOTE(review): presumably the scanner passed to the constructor; confirm.
62 // These next two fields are part of the mechanism for passing references to
63 // already-constructed UnicodeSets back to the UnicodeSet constructor
64 // when the pattern includes $variable references.
65 const UnicodeString ffffString
; // = "/uffff"
  // NOTE(review): the "/" above looks like it stands for a backslash escape
  // (U+FFFF); confirm against the constructor.
66 UnicodeSet
*fCachedSetLookup
;
69 // API inherited from class SymbolTable
70 virtual const UnicodeString
* lookup(const UnicodeString
& s
) const;
71 virtual const UnicodeFunctor
* lookupMatcher(UChar32 ch
) const;
72 virtual UnicodeString
parseReference(const UnicodeString
& text
,
73 ParsePosition
& pos
, int32_t limit
) const;
75 // Additional Functions
// Constructor: takes the rule scanner, the rule string, and an in/out
// UErrorCode used for error reporting.
76 RBBISymbolTable(RBBIRuleScanner
*, const UnicodeString
&fRules
, UErrorCode
&status
);
77 virtual ~RBBISymbolTable();
// lookupNode: find the RBBINode registered under 'key'.
// NOTE(review): the result for a missing key is not visible here; confirm.
79 virtual RBBINode
*lookupNode(const UnicodeString
&key
) const;
// addEntry: register node 'val' under name 'key'; problems are reported
// through 'err'.
80 virtual void addEntry (const UnicodeString
&key
, RBBINode
*val
, UErrorCode
&err
);
// Debugging aid that prints the symbol table.
83 virtual void rbbiSymtablePrint() const;
85 // A do-nothing stand-in for rbbiSymtablePrint() in non-debug builds. The
86 // expansion can't be empty or the call sites won't compile, so it assigns
// to fFakeField instead.
// NOTE(review): the conditional-compilation directives that choose between
// the virtual function above and this macro, and the declaration of
// fFakeField, are not visible here.
88 #define rbbiSymtablePrint() fFakeField=0;
92 RBBISymbolTable(const RBBISymbolTable
&other
); // forbid copying of this class
93 RBBISymbolTable
&operator=(const RBBISymbolTable
&other
); // forbid copying of this class
97 //--------------------------------------------------------------------------------
99 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
101 //--------------------------------------------------------------------------------
// Top-level driver for compiling RBBI rules. Besides the static factory
// function, it holds the state (scanner, parse trees, set builder, table
// builders, rule status values) that the parts of the rule builder share.
102 class RBBIRuleBuilder
: public UMemory
{
105 // Create a rule based break iterator from a set of rules.
106 // This function is the main entry point into the rule builder. The
107 // public ICU API for creating RBBIs uses this function to do the actual work.
//   rules       the source text of the rules to compile.
//   parseError  receives parse error details.
//               NOTE(review): presumably filled in only on failure; confirm.
109 static BreakIterator
* createRuleBasedBreakIterator( const UnicodeString
&rules
,
110 UParseError
&parseError
,
114 // The "public" functions and data members that appear below are accessed
115 // (and shared) by the various parts that make up the rule builder. They
116 // are NOT intended to be accessed by anything outside of the
117 // rule builder implementation.
118 RBBIRuleBuilder(const UnicodeString
&rules
,
119 UParseError
&parseErr
,
123 virtual ~RBBIRuleBuilder();
124 char *fDebugEnv
; // controls debug trace output
125 UErrorCode
*fStatus
; // Error reporting. Keeping status
126 UParseError
*fParseError
; // here avoids passing it everywhere.
127 const UnicodeString
&fRules
; // The rule string that we are compiling
  // (held by reference, not copied).
129 RBBIRuleScanner
*fScanner
; // The scanner.
130 RBBINode
*fForwardTree
; // The parse trees, generated by the scanner,
131 RBBINode
*fReverseTree
; // then manipulated by subsequent steps.
132 RBBINode
*fSafeFwdTree
;
133 RBBINode
*fSafeRevTree
;
135 RBBINode
**fDefaultTree
; // For rules not qualified with a !,
136 // the tree to which they belong.
  // (Pointer to a tree pointer. NOTE(review): presumably it addresses one of
  // the four tree fields above; confirm.)
138 UBool fChainRules
; // True for chained Unicode TR style rules.
139 // False for traditional regexp rules.
141 UBool fLBCMNoChain
; // True: suppress chaining of rules on
142 // chars with LineBreak property == CM.
144 UBool fLookAheadHardBreak
; // True: Look ahead matches cause an
145 // immediate break, no continuing for the
148 RBBISetBuilder
*fSetBuilder
; // Set and Character Category builder.
149 UVector
*fUSetNodes
; // Vector of all uset nodes.
151 RBBITableBuilder
*fForwardTables
; // State transition tables
  // (NOTE(review): presumably one table builder per parse tree above; confirm.)
152 RBBITableBuilder
*fReverseTables
;
153 RBBITableBuilder
*fSafeFwdTables
;
154 RBBITableBuilder
*fSafeRevTables
;
156 UVector
*fRuleStatusVals
; // The values that can be returned
157 // from getRuleStatus().
159 RBBIDataHeader
*flattenData(); // Create the flattened (runtime format)
162 RBBIRuleBuilder(const RBBIRuleBuilder
&other
); // forbid copying of this class
163 RBBIRuleBuilder
&operator=(const RBBIRuleBuilder
&other
); // forbid copying of this class
169 //----------------------------------------------------------------------------
171 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
172 // been encountered. The val Node will be of nodetype uset
173 // and contain pointers to the actual UnicodeSets.
174 // The Key is the source string for initializing the set.
176 // The hash table is used to avoid creating duplicate
177 // unnamed (not $var references) UnicodeSets.
179 // Memory Management:
180 // The Hash Table owns these RBBISetTableEl structs and
181 // the key strings. It does NOT own the val nodes.
183 //----------------------------------------------------------------------------
// Hash table entry for one UnicodeSet that has already been encountered; per
// the description above, the table owns the entry and its key string but not
// the val node.
184 struct RBBISetTableEl
{
190 //----------------------------------------------------------------------------
192 // RBBIDebugPrintf Printf equivalent, for debugging output.
193 // Conditional compilation of the implementation lets us
194 // get rid of the stdio dependency in environments where it is unavailable.
197 //----------------------------------------------------------------------------
// Debug output macros.
// NOTE(review): the #if/#else/#endif that presumably selects between the two
// pairs of directives below is not visible here; confirm the guarding symbol.
//
// First pair: forward the debug macros to the stdio functions printf and puts.
200 #define RBBIDebugPrintf printf
201 #define RBBIDebugPuts puts
// Second pair: leave RBBIDebugPrintf undefined and make RBBIDebugPuts(arg)
// expand to nothing, so no stdio function is referenced.
203 #undef RBBIDebugPrintf
204 #define RBBIDebugPuts(arg)