]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | // | |
4 | // rbbisetb.h | |
5 | /* | |
6 | ********************************************************************** | |
7 | * Copyright (c) 2001-2005, International Business Machines | |
8 | * Corporation and others. All Rights Reserved. | |
9 | ********************************************************************** | |
10 | */ | |
11 | ||
12 | #ifndef RBBISETB_H | |
13 | #define RBBISETB_H | |
14 | ||
15 | #include "unicode/utypes.h" | |
16 | ||
17 | #if !UCONFIG_NO_BREAK_ITERATION | |
18 | ||
19 | #include "unicode/uobject.h" | |
20 | #include "rbbirb.h" | |
21 | #include "utrie2.h" | |
22 | #include "uvector.h" | |
23 | ||
24 | U_NAMESPACE_BEGIN | |
25 | ||
26 | // | |
27 | // RBBISetBuilder Derives the character categories used by the runtime RBBI engine | |
28 | // from the Unicode Sets appearing in the source RBBI rules, and | |
29 | // creates the TRIE table used to map from Unicode to the | |
30 | // character categories. | |
31 | // | |
32 | ||
33 | ||
34 | // | |
35 | // RangeDescriptor | |
36 | // | |
37 | // Each of the non-overlapping character ranges gets one of these descriptors. | |
38 | // All of them are strung together in a linked list, which is kept in order | |
39 | // (by character) | |
40 | // | |
41 | class RangeDescriptor : public UMemory { | |
42 | public: | |
43 | UChar32 fStartChar; // Start of range, unicode 32 bit value. | |
44 | UChar32 fEndChar; // End of range, unicode 32 bit value. | |
45 | int32_t fNum; // runtime-mapped input value for this range. | |
46 | UVector *fIncludesSets; // vector of the the original | |
47 | // Unicode sets that include this range. | |
48 | // (Contains ptrs to uset nodes) | |
49 | RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. | |
50 | ||
51 | RangeDescriptor(UErrorCode &status); | |
52 | RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); | |
53 | ~RangeDescriptor(); | |
54 | void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with | |
55 | // where appearing in the second (higher) part. | |
56 | void setDictionaryFlag(); // Check whether this range appears as part of | |
57 | // the Unicode set named "dictionary" | |
58 | ||
59 | private: | |
60 | RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class | |
61 | RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class | |
62 | }; | |
63 | ||
64 | ||
65 | // | |
66 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. | |
67 | // | |
68 | // Starting with the rules parse tree from the scanner, | |
69 | // | |
70 | // - Enumerate the set of UnicodeSets that are referenced | |
71 | // by the RBBI rules. | |
72 | // - compute a derived set of non-overlapping UnicodeSets | |
73 | // that will correspond to columns in the state table for | |
74 | // the RBBI execution engine. | |
75 | // - construct the trie table that maps input characters | |
76 | // to set numbers in the non-overlapping set of sets. | |
77 | // | |
78 | ||
79 | ||
80 | class RBBISetBuilder : public UMemory { | |
81 | public: | |
82 | RBBISetBuilder(RBBIRuleBuilder *rb); | |
83 | ~RBBISetBuilder(); | |
84 | ||
85 | void buildRanges(); | |
86 | void buildTrie(); | |
87 | void addValToSets(UVector *sets, uint32_t val); | |
88 | void addValToSet (RBBINode *usetNode, uint32_t val); | |
89 | int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the | |
90 | // runtime state machine, which are the same as | |
91 | // columns in the DFA state table | |
92 | int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. | |
93 | void serializeTrie(uint8_t *where); // write out the serialized Trie. | |
94 | UChar32 getFirstChar(int32_t val) const; | |
95 | UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo | |
96 | // character were encountered. | |
97 | /** | |
98 | * Merge two character categories that have been identified as having equivalent behavior. | |
99 | * The ranges belonging to the second category (table column) will be added to the first. | |
100 | * @param categories the pair of categories to be merged. | |
101 | */ | |
102 | void mergeCategories(IntPair categories); | |
103 | ||
104 | static constexpr int32_t DICT_BIT = 0x4000; | |
105 | ||
106 | #ifdef RBBI_DEBUG | |
107 | void printSets(); | |
108 | void printRanges(); | |
109 | void printRangeGroups(); | |
110 | #else | |
111 | #define printSets() | |
112 | #define printRanges() | |
113 | #define printRangeGroups() | |
114 | #endif | |
115 | ||
116 | private: | |
117 | void numberSets(); | |
118 | ||
119 | RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. | |
120 | UErrorCode *fStatus; | |
121 | ||
122 | RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors | |
123 | ||
124 | UTrie2 *fTrie; // The mapping TRIE that is the end result of processing | |
125 | uint32_t fTrieSize; // the Unicode Sets. | |
126 | ||
127 | // Groups correspond to character categories - | |
128 | // groups of ranges that are in the same original UnicodeSets. | |
129 | // fGroupCount is the index of the last used group. | |
130 | // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. | |
131 | // State table column 0 is not used. Column 1 is for end-of-input. | |
132 | // column 2 is for group 0. Funny counting. | |
133 | int32_t fGroupCount; | |
134 | ||
135 | UBool fSawBOF; | |
136 | ||
137 | RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class | |
138 | RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class | |
139 | }; | |
140 | ||
141 | ||
142 | ||
143 | U_NAMESPACE_END | |
144 | ||
145 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
146 | ||
147 | #endif |