]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbiscan57.h
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbiscan57.h
CommitLineData
0f5d89e8
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4//
5// Copyright (C) 2002-2016, International Business Machines Corporation and others.
6// All Rights Reserved.
7//
8/*
9**********************************************************************
10* Legacy version of RBBIRuleScanner from ICU 57,
11* only for use by Apple RuleBasedTokenizer
12**********************************************************************
13*/
14
15
16#ifndef RBBISCAN57_H
17#define RBBISCAN57_H
18
19#include "unicode/utypes.h"
20#include "unicode/uobject.h"
21#include "unicode/uniset.h"
22#include "unicode/parseerr.h"
23#include "uhash.h"
24#include "uvector.h"
25#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
26 // looks up references to $variables within a set.
27#include "rbbi57.h"
28#include "rbbinode.h"
29#include "rbbirpt.h"
30
31U_NAMESPACE_BEGIN
32
33class RBBIRuleBuilder57;
34class RBBISymbolTable57;
35
36
37//--------------------------------------------------------------------------------
38//
39// class RBBIRuleScanner57 does the lowest level, character-at-a-time
40// scanning of break iterator rules.
41//
42// The output of the scanner is parse trees for
43// the rule expressions and a list of all Unicode Sets
44// encountered.
45//
46//--------------------------------------------------------------------------------
47
48class RBBIRuleScanner57 : public UMemory {
49public:
50
51 enum {
52 kStackSize = 100 // The size of the state stack for
53 }; // rules parsing. Corresponds roughly
54 // to the depth of parentheses nesting
55 // that is allowed in the rules.
56
57 struct RBBIRuleChar {
58 UChar32 fChar;
59 UBool fEscaped;
60 RBBIRuleChar() : fChar(0), fEscaped(FALSE) {};
61 };
62
63 RBBIRuleScanner57(RBBIRuleBuilder57 *rb);
64
65
66 virtual ~RBBIRuleScanner57();
67
68 void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
69 // Return false if at end.
70
71 UBool push(const RBBIRuleChar &c); // Push (unget) one character.
72 // Only a single character may be pushed.
73
74 void parse(); // Parse the rules, generating two parse
75 // trees, one each for the forward and
76 // reverse rules,
77 // and a list of UnicodeSets encountered.
78
79 /**
80 * Return a rules string without unnecessary
81 * characters.
82 */
83 static UnicodeString stripRules(const UnicodeString &rules);
84private:
85
86 UBool doParseActions(int32_t a);
87 void error(UErrorCode e); // error reporting convenience function.
88 void fixOpStack(RBBINode::OpPrecedence p);
89 // a character.
90 void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
91
92 UChar32 nextCharLL();
93#ifdef RBBI_DEBUG
94 void printNodeStack(const char *title);
95#endif
96 RBBINode *pushNewNode(RBBINode::NodeType t);
97 void scanSet();
98
99
100 RBBIRuleBuilder57 *fRB; // The rule builder that we are part of.
101
102 int32_t fScanIndex; // Index of current character being processed
103 // in the rule input string.
104 int32_t fNextIndex; // Index of the next character, which
105 // is the first character not yet scanned.
106 UBool fQuoteMode; // Scan is in a 'quoted region'
107 int32_t fLineNum; // Line number in input file.
108 int32_t fCharNum; // Char position within the line.
109 UChar32 fLastChar; // Previous char, needed to count CR-LF
110 // as a single line, not two.
111
112 RBBIRuleChar fC; // Current char for parse state machine
113 // processing.
114 UnicodeString fVarName; // $variableName, valid when we've just
115 // scanned one.
116
117 RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
118 // parsing. index by p[state][char-class]
119
120 uint16_t fStack[kStackSize]; // State stack, holds state pushes
121 int32_t fStackPtr; // and pops as specified in the state
122 // transition rules.
123
124 RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
125 // during the parse of a rule
126 int32_t fNodeStackPtr;
127
128
129 UBool fReverseRule; // True if the rule currently being scanned
130 // is a reverse direction rule (if it
131 // starts with a '!')
132
133 UBool fLookAheadRule; // True if the rule includes a '/'
134 // somewhere within it.
135
136 UBool fNoChainInRule; // True if the current rule starts with a '^'.
137
138 RBBISymbolTable57 *fSymbolTable; // symbol table, holds definitions of
139 // $variable symbols.
140
141 UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to
142 // the sets created while parsing rules.
143 // The key is the string used for creating
144 // the set.
145
146 UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
147 // the scanning of RBBI rules. The
148 // indicies for these are assigned by the
149 // perl script that builds the state tables.
150 // See rbbirpt.h.
151
152 int32_t fRuleNum; // Counts each rule as it is scanned.
153
154 int32_t fOptionStart; // Input index of start of a !!option
155 // keyword, while being scanned.
156
157 UnicodeSet *gRuleSet_rule_char;
158 UnicodeSet *gRuleSet_white_space;
159 UnicodeSet *gRuleSet_name_char;
160 UnicodeSet *gRuleSet_name_start_char;
161
162 RBBIRuleScanner57(const RBBIRuleScanner57 &other); // forbid copying of this class
163 RBBIRuleScanner57 &operator=(const RBBIRuleScanner57 &other); // forbid copying of this class
164};
165
166U_NAMESPACE_END
167
168#endif