]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbisetb.h
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbisetb.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // rbbisetb.h
5 /*
6 **********************************************************************
7 * Copyright (c) 2001-2005, International Business Machines
8 * Corporation and others. All Rights Reserved.
9 **********************************************************************
10 */
11
12 #ifndef RBBISETB_H
13 #define RBBISETB_H
14
15 #include "unicode/utypes.h"
16 #include "unicode/uobject.h"
17 #include "rbbirb.h"
18 #include "uvector.h"
19
20 struct UNewTrie; // Forward declaration only; this header refers to UNewTrie solely through a pointer (RBBISetBuilder::fTrie).
21
22 U_NAMESPACE_BEGIN
23
24 //
25 // RBBISetBuilder Derives the character categories used by the runtime RBBI engine
26 // from the Unicode Sets appearing in the source RBBI rules, and
27 // creates the TRIE table used to map from Unicode to the
28 // character categories.
29 //
30
31
32 //
33 // RangeDescriptor
34 //
35 // Each of the non-overlapping character ranges gets one of these descriptors.
36 // All of them are strung together in a linked list, which is kept in order
37 // (by character)
38 //
39 class RangeDescriptor : public UMemory { // One node per non-overlapping character range; nodes are chained through fNext.
40 public:
41 UChar32 fStartChar; // Start of range, Unicode 32-bit value.
42 UChar32 fEndChar; // End of range, Unicode 32-bit value.
43 int32_t fNum; // Runtime-mapped input value for this range.
44 UVector *fIncludesSets; // Vector of the original
45 // Unicode sets that include this range.
46 // (Contains ptrs to uset nodes.)
47 RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
48
49 RangeDescriptor(UErrorCode &status); // NOTE(review): failures are presumably reported through status - confirm in the implementation.
50 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); // Construct from an existing descriptor; the plain copy constructor below is private, so this is the accessible way to copy.
51 ~RangeDescriptor();
52 void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with
53 // "where" appearing in the second (higher) part.
54 void setDictionaryFlag(); // Check whether this range appears as part of
55 // the Unicode set named "dictionary".
56
57 private:
58 RangeDescriptor(const RangeDescriptor &other); // Declared private to forbid copying of this class.
59 RangeDescriptor &operator=(const RangeDescriptor &other); // Declared private to forbid assignment of this class.
60 };
61
62
63 //
64 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
65 //
66 // Starting with the rules parse tree from the scanner,
67 //
68 // - Enumerate the set of UnicodeSets that are referenced
69 // by the RBBI rules.
70 // - Compute a derived set of non-overlapping UnicodeSets
71 // that will correspond to columns in the state table for
72 // the RBBI execution engine.
73 // - Construct the trie table that maps input characters
74 // to set numbers in the non-overlapping set of sets.
75 //
76
77
78 class RBBISetBuilder : public UMemory {
79 public:
80 RBBISetBuilder(RBBIRuleBuilder *rb); // NOTE(review): rb is presumably the owning rule builder stored in fRB - confirm in the implementation.
81 ~RBBISetBuilder();
82
83 void build(); // NOTE(review): presumably runs the set processing described in the comment preceding this class - confirm.
84 void addValToSets(UVector *sets, uint32_t val);
85 void addValToSet (RBBINode *usetNode, uint32_t val);
86 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
87 // runtime state machine, which are the same as
88 // columns in the DFA state table.
89 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. Deliberately not declared const (see the commented-out qualifier).
90 void serializeTrie(uint8_t *where); // Write out the serialized Trie. NOTE(review): "where" presumably must have room for getTrieSize() bytes - confirm.
91 UChar32 getFirstChar(int32_t val) const; // NOTE(review): presumably the first character mapped to category "val" - confirm.
92 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
93 // character were encountered.
94 #ifdef RBBI_DEBUG // Debug builds declare real dump routines.
95 void printSets();
96 void printRanges();
97 void printRangeGroups();
98 #else // Otherwise the names become empty function-like macros, so calls expand to nothing. Note that macros are not scoped to this class.
99 #define printSets()
100 #define printRanges()
101 #define printRangeGroups()
102 #endif
103
104 private:
105 void numberSets();
106
107 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
108 UErrorCode *fStatus; // NOTE(review): presumably points at the rule builder's error code - confirm.
109
110 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors.
111
112 UNewTrie *fTrie; // The mapping TRIE that is the end result of processing the Unicode Sets.
113 uint32_t fTrieSize; // NOTE(review): presumably the byte size reported by getTrieSize() - confirm.
114
115 // Groups correspond to character categories -
116 // groups of ranges that are in the same original UnicodeSets.
117 // fGroupCount is the index of the last used group.
118 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
119 // State table column 0 is not used. Column 1 is for end-of-input.
120 // column 2 is for group 0. Funny counting.
121 // NOTE(review): if column 2 holds group 0, the last group would land in column
121 // fGroupCount+2, which does not match the "+1" stated above - confirm against getNumCharCategories().
121 int32_t fGroupCount;
122
123 UBool fSawBOF; // NOTE(review): presumably the flag returned by sawBOF() - confirm.
124
125 RBBISetBuilder(const RBBISetBuilder &other); // Declared private to forbid copying of this class.
126 RBBISetBuilder &operator=(const RBBISetBuilder &other); // Declared private to forbid assignment of this class.
127 };
128
129
130
131 U_NAMESPACE_END
132 #endif