1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
6 // Copyright (C) 2002-2008, International Business Machines Corporation and others.
7 // All Rights Reserved.
9 // This file contains declarations for several classes from the
10 // Rule Based Break Iterator rule builder.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_BREAK_ITERATION
23 #include "unicode/uobject.h"
24 #include "unicode/rbbi.h"
25 #include "unicode/uniset.h"
26 #include "unicode/parseerr.h"
29 #include "unicode/symtable.h"// For UnicodeSet parsing: this is the interface that
30 // looks up references to $variables within a set.
// Forward declarations of rule-builder types defined elsewhere.
// RBBIRuleScanner and RBBITableBuilder are used below only as pointer types;
// RBBIRuleTableEl is not referenced in the visible part of this file.
35 class RBBIRuleScanner
;
36 struct RBBIRuleTableEl
;
39 class RBBITableBuilder
;
43 //--------------------------------------------------------------------------------
45 // RBBISymbolTable. Implements SymbolTable interface that is used by the
46 // UnicodeSet parser to resolve references to $variables.
48 //--------------------------------------------------------------------------------
// One entry of the symbol table's hash table. Derives from UMemory.
// NOTE(review): only the constructor, destructor and the copy-forbidding
// declarations are visible in this excerpt; any data members and the closing
// of the class are not shown.
49 class RBBISymbolTableEntry
: public UMemory
{ // The symbol table hash table contains one
50 public: // of these structs for each entry.
// Default constructor.
51 RBBISymbolTableEntry();
// Destructor (declared non-virtual).
54 ~RBBISymbolTableEntry();
// Copy constructor, declared only so that copying is forbidden.
// NOTE(review): no access specifier or definition for it is visible here.
57 RBBISymbolTableEntry(const RBBISymbolTableEntry
&other
); // forbid copying of this class
// Copy assignment, declared for the same reason as the copy constructor.
58 RBBISymbolTableEntry
&operator=(const RBBISymbolTableEntry
&other
); // forbid copying of this class
// Symbol table used by the rule builder. Derives from UMemory and from
// SymbolTable, whose three virtual functions (lookup, lookupMatcher,
// parseReference) are re-declared below, and adds node lookup / entry insertion.
// NOTE(review): access specifiers, some members and the closing of the class
// are not visible in this excerpt.
62 class RBBISymbolTable
: public UMemory
, public SymbolTable
{
// Reference to a rule string.
// NOTE(review): presumably bound to the constructor's rule-string argument - confirm.
64 const UnicodeString
&fRules
;
// Hash table backing the symbol table.
// NOTE(review): presumably holds RBBISymbolTableEntry values - confirm.
65 UHashtable
*fHashTable
;
// The rule scanner associated with this table; ownership is not visible here.
66 RBBIRuleScanner
*fRuleScanner
;
68 // These next two fields are part of the mechanism for passing references to
69 // already-constructed UnicodeSets back to the UnicodeSet constructor
70 // when the pattern includes $variable references.
// NOTE(review): "/uffff" below presumably means the single code point U+FFFF
// ("\uffff") - confirm against the constructor's initializer.
71 const UnicodeString ffffString
; // = "/uffff"
72 UnicodeSet
*fCachedSetLookup
;
75 // API inherited from class SymbolTable
// lookup: takes a string, returns a pointer to a const UnicodeString.
76 virtual const UnicodeString
* lookup(const UnicodeString
& s
) const;
// lookupMatcher: takes a code point, returns a pointer to a const UnicodeFunctor.
77 virtual const UnicodeFunctor
* lookupMatcher(UChar32 ch
) const;
// parseReference: takes the text, a ParsePosition (by non-const reference) and
// a limit index; returns a UnicodeString by value.
78 virtual UnicodeString
parseReference(const UnicodeString
& text
,
79 ParsePosition
& pos
, int32_t limit
) const;
81 // Additional Functions
// Constructor: takes the scanner, the rule string and an in/out error code.
82 RBBISymbolTable(RBBIRuleScanner
*, const UnicodeString
&fRules
, UErrorCode
&status
);
83 virtual ~RBBISymbolTable();
// lookupNode: returns the RBBINode pointer associated with `key`.
// NOTE(review): the result for an unknown key is not visible here.
85 virtual RBBINode
*lookupNode(const UnicodeString
&key
) const;
// addEntry: associates `key` with `val`; failures are reported through `err`.
// NOTE(review): ownership of `val` and duplicate-key handling are not visible here.
86 virtual void addEntry (const UnicodeString
&key
, RBBINode
*val
, UErrorCode
&err
);
// Debug dump of the symbol table.
// NOTE(review): the preprocessor conditionals that presumably select between
// this declaration and the macro below are not visible in this excerpt.
89 virtual void rbbiSymtablePrint() const;
91 // A do-nothing inline function for non-debug builds. Member funcs can't be empty
92 // or the call sites won't compile.
// The macro replaces each call with an assignment to fFakeField.
// NOTE(review): the declaration of fFakeField is not visible in this excerpt.
94 #define rbbiSymtablePrint() fFakeField=0;
// Copy constructor and copy assignment, declared only to forbid copying.
98 RBBISymbolTable(const RBBISymbolTable
&other
); // forbid copying of this class
99 RBBISymbolTable
&operator=(const RBBISymbolTable
&other
); // forbid copying of this class
103 //--------------------------------------------------------------------------------
105 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
107 //--------------------------------------------------------------------------------
// Top-level rule compiler state. Derives from UMemory. Holds the rule text, the
// parse trees produced by the scanner, the rule-option flags and the builders
// used for sets and state tables, as declared below.
// NOTE(review): access specifiers, block-comment delimiters, the tails of two
// declarations and the closing of the class are not visible in this excerpt.
108 class RBBIRuleBuilder
: public UMemory
{
111 // Create a rule based break iterator from a set of rules.
112 // This function is the main entry point into the rule builder. The
113 // public ICU API for creating RBBIs uses this function to do the actual work.
115 static BreakIterator
* createRuleBasedBreakIterator( const UnicodeString
&rules
,
116 UParseError
*parseError
,
// NOTE(review): the remaining parameter(s) and the end of the declaration
// above are not visible in this excerpt.
120 // The "public" functions and data members that appear below are accessed
121 // (and shared) by the various parts that make up the rule builder. They
122 // are NOT intended to be accessed by anything outside of the
123 // rule builder implementation.
124 RBBIRuleBuilder(const UnicodeString
&rules
,
125 UParseError
*parseErr
,
// NOTE(review): the remaining constructor parameter(s) are not visible in this excerpt.
129 virtual ~RBBIRuleBuilder();
// The next line is the body text of a block comment whose delimiters are
// missing from this excerpt; it documents build().
132 * Build the state tables and char class Trie from the source rules.
134 RBBIDataHeader
*build(UErrorCode
&status
);
// The next three lines are likewise block-comment text; they document optimizeTables().
138 * Fold together redundant character classes (table columns) and
139 * redundant states (table rows). Done after initial table generation,
140 * before serializing the result.
142 void optimizeTables();
144 char *fDebugEnv
; // controls debug trace output
145 UErrorCode
*fStatus
; // Error reporting. Keeping status
146 UParseError
*fParseError
; // here avoids passing it everywhere.
147 const UnicodeString
&fRules
; // The rule string that we are compiling
148 UnicodeString fStrippedRules
; // The rule string, with comments stripped.
150 RBBIRuleScanner
*fScanner
; // The scanner.
151 RBBINode
*fForwardTree
; // The parse trees, generated by the scanner,
152 RBBINode
*fReverseTree
; // then manipulated by subsequent steps.
// Two further parse-tree roots of the same RBBINode* type.
// NOTE(review): going by the names, these are for the safe-forward and
// safe-reverse rules - confirm.
153 RBBINode
*fSafeFwdTree
;
154 RBBINode
*fSafeRevTree
;
156 RBBINode
**fDefaultTree
; // For rules not qualified with a '!',
157 // the tree to which they belong.
159 UBool fChainRules
; // True for chained Unicode TR style rules.
160 // False for traditional regexp rules.
162 UBool fLBCMNoChain
; // True: suppress chaining of rules on
163 // chars with LineBreak property == CM.
165 UBool fLookAheadHardBreak
; // True: Look ahead matches cause an
166 // immediate break, no continuing for the
// longest match. NOTE(review): the end of this sentence is missing from the
// excerpt; "longest match" is a reconstruction - confirm.
169 RBBISetBuilder
*fSetBuilder
; // Set and Character Category builder.
170 UVector
*fUSetNodes
; // Vector of all uset nodes.
172 RBBITableBuilder
*fForwardTable
; // State transition table, build time form.
174 UVector
*fRuleStatusVals
; // The values that can be returned
175 // from getRuleStatus().
// NOTE(review): the trailing comment on flattenData() appears to be cut off.
177 RBBIDataHeader
*flattenData(); // Create the flattened (runtime format)
// Copy constructor and copy assignment, declared only to forbid copying.
180 RBBIRuleBuilder(const RBBIRuleBuilder
&other
); // forbid copying of this class
181 RBBIRuleBuilder
&operator=(const RBBIRuleBuilder
&other
); // forbid copying of this class
187 //----------------------------------------------------------------------------
189 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
190 // been encountered. The val Node will be of nodetype uset
191 // and contain pointers to the actual UnicodeSets.
192 // The Key is the source string for initializing the set.
194 // The hash table is used to avoid creating duplicate
195 // unnamed (not $var references) UnicodeSets.
197 // Memory Management:
198 // The Hash Table owns these RBBISetTableEl structs and
199 // the key strings. It does NOT own the val nodes.
201 //----------------------------------------------------------------------------
// Hash-table entry type for UnicodeSets that have been encountered.
// NOTE(review): only the opening of this struct is visible in this excerpt; its
// members and closing brace are not shown, so its layout is not documented here.
202 struct RBBISetTableEl
{
208 // A pair of ints, used to bundle pairs of states or pairs of character classes.
// IntPair: alias for std::pair<int32_t, int32_t>.
// NOTE(review): std::pair is declared in <utility>; no #include <utility> is
// visible in this excerpt - confirm it is included.
210 typedef std::pair
<int32_t, int32_t> IntPair
;
213 //----------------------------------------------------------------------------
215 // RBBIDebugPrintf Printf equivalent, for debugging output.
216 // Conditional compilation of the implementation lets us
217 // get rid of the stdio dependency in environments where it is unavailable.
220 //----------------------------------------------------------------------------
// Debug-output macros.
// NOTE(review): the conditional-compilation directives that presumably select
// between the two groups below are not visible in this excerpt.
// First group: map the debug macros directly onto printf and puts.
223 #define RBBIDebugPrintf printf
224 #define RBBIDebugPuts puts
// Second group: remove RBBIDebugPrintf and make RBBIDebugPuts(arg) expand to nothing.
226 #undef RBBIDebugPrintf
227 #define RBBIDebugPuts(arg)
232 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */