]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbiscan.h
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbiscan.h
CommitLineData
b75a7d8f
A
1//
2// rbbiscan.h
3//
2ca993e8 4// Copyright (C) 2002-2016, International Business Machines Corporation and others.
b75a7d8f
A
5// All Rights Reserved.
6//
7// This file contains declarations for class RBBIRuleScanner
8//
9
10
11#ifndef RBBISCAN_H
12#define RBBISCAN_H
13
14#include "unicode/utypes.h"
15#include "unicode/uobject.h"
16#include "unicode/rbbi.h"
17#include "unicode/uniset.h"
18#include "unicode/parseerr.h"
19#include "uhash.h"
20#include "uvector.h"
374ca955 21#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
b75a7d8f
A
22 // looks up references to $variables within a set.
23#include "rbbinode.h"
2ca993e8 24#include "rbbirpt.h"
b75a7d8f
A
25
26U_NAMESPACE_BEGIN
27
28class RBBIRuleBuilder;
29class RBBISymbolTable;
30
31
32//--------------------------------------------------------------------------------
33//
34// class RBBIRuleScanner does the lowest level, character-at-a-time
35// scanning of break iterator rules.
36//
37// The output of the scanner is parse trees for
38// the rule expressions and a list of all Unicode Sets
39// encountered.
40//
41//--------------------------------------------------------------------------------
b75a7d8f
A
42
43class RBBIRuleScanner : public UMemory {
44public:
45
46f4442e
A
46 enum {
47 kStackSize = 100 // The size of the state stack for
48 }; // rules parsing. Corresponds roughly
49 // to the depth of parentheses nesting
50 // that is allowed in the rules.
51
b75a7d8f
A
52 struct RBBIRuleChar {
53 UChar32 fChar;
54 UBool fEscaped;
2ca993e8 55 RBBIRuleChar() : fChar(0), fEscaped(FALSE) {}; // Defaults: fChar = 0, fEscaped = FALSE.
b75a7d8f
A
56 };
57
58 RBBIRuleScanner(RBBIRuleBuilder *rb);
59
60
61 virtual ~RBBIRuleScanner();
62
63 void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
64 // NOTE(review): declared void, so nothing is returned; how end of input is signalled (presumably through c) should be confirmed in the implementation.
65
66 UBool push(const RBBIRuleChar &c); // Push (unget) one character.
67 // Only a single character may be pushed.
68
69 void parse(); // Parse the rules, generating two parse
70 // trees, one each for the forward and
71 // reverse rules,
72 // and a list of UnicodeSets encountered.
73
74 /**
75 * Return a rules string without unnecessary
76 * characters.
77 */
78 static UnicodeString stripRules(const UnicodeString &rules);
79private:
80
46f4442e 81 UBool doParseActions(int32_t a);
b75a7d8f
A
82 void error(UErrorCode e); // error reporting convenience function.
83 void fixOpStack(RBBINode::OpPrecedence p);
84 // NOTE(review): this line originally read "a character." - an apparently orphaned comment fragment that matches neither neighboring declaration.
85 void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); // setToAdopt defaults to NULL when omitted.
86
87 UChar32 nextCharLL();
73c04bcf 88#ifdef RBBI_DEBUG
b75a7d8f 89 void printNodeStack(const char *title);
73c04bcf 90#endif
b75a7d8f
A
91 RBBINode *pushNewNode(RBBINode::NodeType t);
92 void scanSet();
93
94
95 RBBIRuleBuilder *fRB; // The rule builder that we are part of.
96
97 int32_t fScanIndex; // Index of current character being processed
98 // in the rule input string.
99 int32_t fNextIndex; // Index of the next character, which
100 // is the first character not yet scanned.
101 UBool fQuoteMode; // Scan is in a 'quoted region'
73c04bcf
A
102 int32_t fLineNum; // Line number in input file.
103 int32_t fCharNum; // Char position within the line.
b75a7d8f
A
104 UChar32 fLastChar; // Previous char, needed to count CR-LF
105 // as a single line, not two.
106
107 RBBIRuleChar fC; // Current char for parse state machine
108 // processing.
109 UnicodeString fVarName; // $variableName, valid when we've just
110 // scanned one.
111
112 RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
113 // parsing. index by p[state][char-class]
114
115 uint16_t fStack[kStackSize]; // State stack, holds state pushes
73c04bcf 116 int32_t fStackPtr; // and pops as specified in the state
b75a7d8f
A
117 // transition rules.
118
119 RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
120 // during the parse of a rule
73c04bcf 121 int32_t fNodeStackPtr; // NOTE(review): presumably the current top-of-stack index into fNodeStack - confirm.
b75a7d8f
A
122
123
124 UBool fReverseRule; // True if the rule currently being scanned
125 // is a reverse direction rule (if it
126 // starts with a '!')
127
128 UBool fLookAheadRule; // True if the rule includes a '/'
129 // somewhere within it.
130
2ca993e8
A
131 UBool fNoChainInRule; // True if the current rule starts with a '^'.
132
b75a7d8f
A
133 RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
134 // $variable symbols.
135
136 UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to
137 // the sets created while parsing rules.
138 // The key is the string used for creating
139 // the set.
140
46f4442e 141 UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
b75a7d8f
A
142 // the scanning of RBBI rules. The
143 // indices for these are assigned by the
144 // perl script that builds the state tables.
145 // See rbbirpt.h.
146
147 int32_t fRuleNum; // Counts each rule as it is scanned.
148
374ca955
A
149 int32_t fOptionStart; // Input index of start of a !!option
150 // keyword, while being scanned.
151
b75a7d8f
A
152 UnicodeSet *gRuleSet_rule_char;
153 UnicodeSet *gRuleSet_white_space;
154 UnicodeSet *gRuleSet_name_char;
155 UnicodeSet *gRuleSet_name_start_char;
156
157 RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
158 RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
159};
160
161U_NAMESPACE_END
162
163#endif