]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ********************************************************************** | |
5 | * Copyright (c) 2001-2005, International Business Machines | |
6 | * Corporation and others. All Rights Reserved. | |
7 | ********************************************************************** | |
8 | * | |
9 | ********************************************************************** | |
10 | * Legacy version of RangeDescriptor and RBBISetBuilder from ICU 57, | |
11 | * only for use by Apple RuleBasedTokenizer | |
12 | ********************************************************************** | |
13 | */ | |
14 | ||
15 | #ifndef RBBISETB57_H | |
16 | #define RBBISETB57_H | |
17 | ||
18 | #include "unicode/utypes.h" | |
19 | #include "unicode/uobject.h" | |
20 | #include "rbbirb57.h" | |
21 | #include "rbbinode.h" | |
22 | #include "uvector.h" | |
23 | ||
24 | struct UNewTrie; | |
25 | ||
26 | U_NAMESPACE_BEGIN | |
27 | ||
28 | class RBBIRuleBuilder57; | |
29 | ||
30 | // | |
31 | // RBBISetBuilder57 Derives the character categories used by the runtime RBBI engine | |
32 | // from the Unicode Sets appearing in the source RBBI rules, and | |
33 | // creates the TRIE table used to map from Unicode to the | |
34 | // character categories. | |
35 | // | |
36 | ||
37 | ||
38 | // | |
39 | // RangeDescriptor57 | |
40 | // | |
41 | // Each of the non-overlapping character ranges gets one of these descriptors. | |
42 | // All of them are strung together in a linked list, which is kept in order | |
43 | // (by character) | |
44 | // | |
45 | class RangeDescriptor57 : public UMemory { | |
46 | public: | |
47 | UChar32 fStartChar; // Start of range, Unicode 32 bit value. | |
48 | UChar32 fEndChar; // End of range, Unicode 32 bit value. | |
49 | int32_t fNum; // Runtime-mapped input value for this range. | |
50 | UVector *fIncludesSets; // Vector of the original | |
51 | // Unicode sets that include this range. | |
52 | // (Contains ptrs to uset nodes.) | |
53 | RangeDescriptor57 *fNext; // Next RangeDescriptor57 in the linked list. | |
54 | ||
55 | RangeDescriptor57(UErrorCode &status); | |
56 | RangeDescriptor57(const RangeDescriptor57 &other, UErrorCode &status); | |
57 | ~RangeDescriptor57(); | |
58 | void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with | |
59 | // "where" appearing in the second (higher) part. | |
60 | void setDictionaryFlag(); // Check whether this range appears as part of | |
61 | // the Unicode set named "dictionary". | |
62 | ||
63 | private: | |
64 | RangeDescriptor57(const RangeDescriptor57 &other); // Private: forbid plain copying of this class. | |
65 | RangeDescriptor57 &operator=(const RangeDescriptor57 &other); // Private: forbid assignment of this class. | |
66 | }; | |
67 | ||
68 | ||
69 | // | |
70 | // RBBISetBuilder57 Handles processing of Unicode Sets from RBBI rules. | |
71 | // | |
72 | // Starting with the rules parse tree from the scanner, | |
73 | // | |
74 | // - Enumerate the set of UnicodeSets that are referenced | |
75 | // by the RBBI rules. | |
76 | // - compute a derived set of non-overlapping UnicodeSets | |
77 | // that will correspond to columns in the state table for | |
78 | // the RBBI execution engine. | |
79 | // - construct the trie table that maps input characters | |
80 | // to set numbers in the non-overlapping set of sets. | |
81 | // | |
82 | ||
83 | ||
84 | class RBBISetBuilder57 : public UMemory { | |
85 | public: | |
86 | RBBISetBuilder57(RBBIRuleBuilder57 *rb); | |
87 | ~RBBISetBuilder57(); | |
88 | ||
89 | void build(); | |
90 | void addValToSets(UVector *sets, uint32_t val); | |
91 | void addValToSet (RBBINode *usetNode, uint32_t val); | |
92 | int32_t getNumCharCategories() const; // CharCategories are the same as the input symbol set of the | |
93 | // runtime state machine, which are the same as | |
94 | // columns in the DFA state table. | |
95 | int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. | |
96 | void serializeTrie(uint8_t *where); // Write out the serialized Trie to "where". | |
97 | UChar32 getFirstChar(int32_t val) const; | |
98 | UBool sawBOF() const; // Indicates whether any references to the {bof} pseudo | |
99 | // character were encountered. | |
100 | #ifdef RBBI_DEBUG | |
101 | void printSets(); | |
102 | void printRanges(); | |
103 | void printRangeGroups(); | |
104 | #else | |
105 | #define printSets() | |
106 | #define printRanges() | |
107 | #define printRangeGroups() | |
108 | #endif | |
109 | ||
110 | private: | |
111 | void numberSets(); | |
112 | ||
113 | RBBIRuleBuilder57 *fRB; // The RBBI Rule Compiler that owns us. | |
114 | UErrorCode *fStatus; | |
115 | ||
116 | RangeDescriptor57 *fRangeList; // Head of the linked list of RangeDescriptors. | |
117 | ||
118 | UNewTrie *fTrie; // The mapping TRIE that is the end result of processing the Unicode Sets. | |
119 | uint32_t fTrieSize; // NOTE(review): presumably the serialized size reported by getTrieSize() - confirm. | |
120 | ||
121 | // Groups correspond to character categories - | |
122 | // groups of ranges that are in the same original UnicodeSets. | |
123 | // fGroupCount is the index of the last used group. | |
124 | // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. | |
125 | // State table column 0 is not used. Column 1 is for end-of-input. | |
126 | // Column 2 is for group 0. Funny counting. | |
127 | int32_t fGroupCount; | |
128 | ||
129 | UBool fSawBOF; | |
130 | ||
131 | RBBISetBuilder57(const RBBISetBuilder57 &other); // Private: forbid copying of this class. | |
132 | RBBISetBuilder57 &operator=(const RBBISetBuilder57 &other); // Private: forbid assignment of this class. | |
133 | }; | |
134 | ||
135 | ||
136 | ||
137 | U_NAMESPACE_END | |
138 | #endif |