]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbisetb.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
6 **********************************************************************
7 * Copyright (c) 2001-2005, International Business Machines
8 * Corporation and others. All Rights Reserved.
9 **********************************************************************
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_BREAK_ITERATION
19 #include "unicode/uobject.h"
27 // RBBISetBuilder Derives the character categories used by the runtime RBBI engine
28 // from the Unicode Sets appearing in the source RBBI rules, and
29 // creates the TRIE table used to map from Unicode to the
30 // character categories.
37 // Each of the non-overlapping character ranges gets one of these descriptors.
38 // All of them are strung together in a linked list, which is kept in order
41 class RangeDescriptor
: public UMemory
{
43 UChar32 fStartChar
; // Start of range, unicode 32 bit value.
44 UChar32 fEndChar
; // End of range, unicode 32 bit value.
45 int32_t fNum
; // runtime-mapped input value for this range.
46 UVector
*fIncludesSets
; // vector of the original
47 // Unicode sets that include this range.
48 // (Contains ptrs to uset nodes)
49 RangeDescriptor
*fNext
; // Next RangeDescriptor in the linked list.
51 RangeDescriptor(UErrorCode
&status
);
52 RangeDescriptor(const RangeDescriptor
&other
, UErrorCode
&status
);
54 void split(UChar32 where
, UErrorCode
&status
); // Split this range in two at "where", with
55 // where appearing in the second (higher) part.
56 void setDictionaryFlag(); // Check whether this range appears as part of
57 // the Unicode set named "dictionary"
60 RangeDescriptor(const RangeDescriptor
&other
); // forbid copying of this class
61 RangeDescriptor
&operator=(const RangeDescriptor
&other
); // forbid copying of this class
66 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
68 // Starting with the rules parse tree from the scanner,
70 // - Enumerate the set of UnicodeSets that are referenced
72 // - compute a derived set of non-overlapping UnicodeSets
73 // that will correspond to columns in the state table for
74 // the RBBI execution engine.
75 // - construct the trie table that maps input characters
76 // to set numbers in the non-overlapping set of sets.
80 class RBBISetBuilder
: public UMemory
{
82 RBBISetBuilder(RBBIRuleBuilder
*rb
);
87 void addValToSets(UVector
*sets
, uint32_t val
);
88 void addValToSet (RBBINode
*usetNode
, uint32_t val
);
89 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
90 // runtime state machine, which are the same as
91 // columns in the DFA state table
92 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
93 void serializeTrie(uint8_t *where
); // write out the serialized Trie.
94 UChar32
getFirstChar(int32_t val
) const;
95 UBool
sawBOF() const; // Indicate whether any references to the {bof} pseudo
96 // character were encountered.
98 * Merge two character categories that have been identified as having equivalent behavior.
99 * The ranges belonging to the second category (table column) will be added to the first.
100 * @param categories the pair of categories to be merged.
102 void mergeCategories(IntPair categories
);
104 static constexpr int32_t DICT_BIT
= 0x4000;
109 void printRangeGroups();
112 #define printRanges()
113 #define printRangeGroups()
119 RBBIRuleBuilder
*fRB
; // The RBBI Rule Compiler that owns us.
122 RangeDescriptor
*fRangeList
; // Head of the linked list of RangeDescriptors
124 UTrie2
*fTrie
; // The mapping TRIE that is the end result of processing
125 uint32_t fTrieSize
; // the Unicode Sets.
127 // Groups correspond to character categories -
128 // groups of ranges that are in the same original UnicodeSets.
129 // fGroupCount is the index of the last used group.
130 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
131 // State table column 0 is not used. Column 1 is for end-of-input.
132 // Column 2 is for group 0. Funny counting.
137 RBBISetBuilder(const RBBISetBuilder
&other
); // forbid copying of this class
138 RBBISetBuilder
&operator=(const RBBISetBuilder
&other
); // forbid copying of this class
145 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */