]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | // |
2 | // rbbisetb.h | |
3 | /* | |
4 | ********************************************************************** | |
73c04bcf | 5 | * Copyright (c) 2001-2005, International Business Machines |
b75a7d8f A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | */ | |
9 | ||
10 | #ifndef RBBISETB_H | |
11 | #define RBBISETB_H | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | #include "unicode/uobject.h" | |
15 | #include "rbbirb.h" | |
16 | #include "uvector.h" | |
b75a7d8f A |
17 | |
18 | struct UNewTrie; | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
22 | // | |
23 | // RBBISetBuilder Derives the character categories used by the runtime RBBI engine | |
24 | // from the Unicode Sets appearing in the source RBBI rules, and | |
25 | // creates the TRIE table used to map from Unicode to the | |
26 | // character categories. | |
27 | // | |
28 | ||
29 | ||
30 | // | |
31 | // RangeDescriptor | |
32 | // | |
33 | // Each of the non-overlapping character ranges gets one of these descriptors. | |
34 | // All of them are strung together in a linked list, which is kept in order | |
35 | // (by character) | |
36 | // | |
37 | class RangeDescriptor : public UMemory { | |
38 | public: | |
39 | UChar32 fStartChar; // Start of range, Unicode 32 bit value. | |
40 | UChar32 fEndChar; // End of range, Unicode 32 bit value. | |
41 | int32_t fNum; // runtime-mapped input value for this range. | |
42 | UVector *fIncludesSets; // vector of the original | |
43 | // Unicode sets that include this range. | |
44 | // (Contains ptrs to uset nodes) | |
45 | RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. | |
46 | ||
47 | RangeDescriptor(UErrorCode &status); | |
48 | RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); | |
49 | ~RangeDescriptor(); | |
50 | void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with | |
51 | // where appearing in the second (higher) part. | |
52 | void setDictionaryFlag(); // Check whether this range appears as part of | |
53 | // the Unicode set named "dictionary" | |
54 | ||
55 | private: | |
56 | RangeDescriptor(const RangeDescriptor &other); // forbid copy construction of this class | |
57 | RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copy assignment of this class | |
58 | }; | |
59 | ||
60 | ||
61 | // | |
62 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. | |
63 | // | |
64 | // Starting with the rules parse tree from the scanner, | |
65 | // | |
66 | // - Enumerate the set of UnicodeSets that are referenced | |
67 | // by the RBBI rules. | |
68 | // - compute a derived set of non-overlapping UnicodeSets | |
69 | // that will correspond to columns in the state table for | |
70 | // the RBBI execution engine. | |
71 | // - construct the trie table that maps input characters | |
72 | // to set numbers in the non-overlapping set of sets. | |
73 | // | |
74 | ||
75 | ||
76 | class RBBISetBuilder : public UMemory { | |
77 | public: | |
78 | RBBISetBuilder(RBBIRuleBuilder *rb); | |
79 | ~RBBISetBuilder(); | |
80 | ||
81 | void build(); | |
73c04bcf A |
82 | void addValToSets(UVector *sets, uint32_t val); |
83 | void addValToSet (RBBINode *usetNode, uint32_t val); | |
374ca955 | 84 | int32_t getNumCharCategories() const; // CharCategories are the same as the input symbol set of the |
73c04bcf A |
85 | // runtime state machine, which are the same as |
86 | // columns in the DFA state table. | |
374ca955 | 87 | int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. |
b75a7d8f | 88 | void serializeTrie(uint8_t *where); // Write out the serialized Trie. |
374ca955 | 89 | UChar32 getFirstChar(int32_t val) const; |
73c04bcf A |
90 | UBool sawBOF() const; // Indicates whether any references to the {bof} pseudo |
91 | // character were encountered. | |
374ca955 | 92 | #ifdef RBBI_DEBUG |
b75a7d8f A |
93 | void printSets(); |
94 | void printRanges(); | |
95 | void printRangeGroups(); | |
374ca955 A |
96 | #else |
97 | #define printSets() | |
98 | #define printRanges() | |
99 | #define printRangeGroups() | |
100 | #endif | |
b75a7d8f A |
101 | |
102 | private: | |
103 | void numberSets(); | |
104 | ||
105 | RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. | |
106 | UErrorCode *fStatus; | |
107 | ||
108 | RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors. | |
109 | ||
110 | UNewTrie *fTrie; // The mapping TRIE that is the end result of processing | |
111 | uint32_t fTrieSize; // the Unicode Sets. | |
112 | ||
113 | // Groups correspond to character categories - | |
114 | // groups of ranges that are in the same original UnicodeSets. | |
115 | // fGroupCount is the index of the last used group. | |
73c04bcf A |
116 | // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. |
117 | // State table column 0 is not used. Column 1 is for end-of-input. | |
118 | // Column 2 is for group 0. Funny counting. | |
b75a7d8f A |
119 | int32_t fGroupCount; |
120 | ||
73c04bcf A |
121 | UBool fSawBOF; |
122 | ||
b75a7d8f A |
123 | RBBISetBuilder(const RBBISetBuilder &other); // forbid copy construction of this class |
124 | RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copy assignment of this class | |
125 | }; | |
126 | ||
127 | ||
128 | ||
129 | U_NAMESPACE_END | |
130 | #endif |