]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbisetb.h
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / rbbisetb.h
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3//
4// rbbisetb.h
5/*
6**********************************************************************
73c04bcf 7* Copyright (c) 2001-2005, International Business Machines
b75a7d8f
A
8* Corporation and others. All Rights Reserved.
9**********************************************************************
10*/
11
12#ifndef RBBISETB_H
13#define RBBISETB_H
14
15#include "unicode/utypes.h"
0f5d89e8
A
16
17#if !UCONFIG_NO_BREAK_ITERATION
18
b75a7d8f
A
19#include "unicode/uobject.h"
20#include "rbbirb.h"
0f5d89e8 21#include "utrie2.h"
b75a7d8f 22#include "uvector.h"
b75a7d8f 23
b75a7d8f
A
24U_NAMESPACE_BEGIN
25
//
//  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
//                   from the Unicode Sets appearing in the source RBBI rules, and
//                   creates the TRIE table used to map from Unicode to the
//                   character categories.
//
32
33
//
//  RangeDescriptor
//
//     Each of the non-overlapping character ranges gets one of these descriptors.
//     All of them are strung together in a linked list, which is kept in order
//     (by character).
//
41class RangeDescriptor : public UMemory {
42public:
43 UChar32 fStartChar; // Start of range, unicode 32 bit value.
44 UChar32 fEndChar; // End of range, unicode 32 bit value.
45 int32_t fNum; // runtime-mapped input value for this range.
46 UVector *fIncludesSets; // vector of the the original
47 // Unicode sets that include this range.
48 // (Contains ptrs to uset nodes)
49 RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
50
51 RangeDescriptor(UErrorCode &status);
52 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
53 ~RangeDescriptor();
54 void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
55 // where appearing in the second (higher) part.
56 void setDictionaryFlag(); // Check whether this range appears as part of
57 // the Unicode set named "dictionary"
58
59private:
60 RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
61 RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
62};
63
64
//
//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
//
//      Starting with the rules parse tree from the scanner,
//
//                   -  Enumerate the set of UnicodeSets that are referenced
//                      by the RBBI rules.
//                   -  Compute a derived set of non-overlapping UnicodeSets
//                      that will correspond to columns in the state table for
//                      the RBBI execution engine.
//                   -  Construct the trie table that maps input characters
//                      to set numbers in the non-overlapping set of sets.
//
78
79
80class RBBISetBuilder : public UMemory {
81public:
82 RBBISetBuilder(RBBIRuleBuilder *rb);
83 ~RBBISetBuilder();
84
0f5d89e8
A
85 void buildRanges();
86 void buildTrie();
73c04bcf
A
87 void addValToSets(UVector *sets, uint32_t val);
88 void addValToSet (RBBINode *usetNode, uint32_t val);
374ca955 89 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
73c04bcf
A
90 // runtime state machine, which are the same as
91 // columns in the DFA state table
374ca955 92 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
b75a7d8f 93 void serializeTrie(uint8_t *where); // write out the serialized Trie.
374ca955 94 UChar32 getFirstChar(int32_t val) const;
73c04bcf
A
95 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
96 // character were encountered.
0f5d89e8
A
97 /**
98 * Merge two character categories that have been identified as having equivalent behavior.
99 * The ranges belonging to the second category (table column) will be added to the first.
100 * @param categories the pair of categories to be merged.
101 */
102 void mergeCategories(IntPair categories);
103
104 static constexpr int32_t DICT_BIT = 0x4000;
105
374ca955 106#ifdef RBBI_DEBUG
b75a7d8f
A
107 void printSets();
108 void printRanges();
109 void printRangeGroups();
374ca955
A
110#else
111 #define printSets()
112 #define printRanges()
113 #define printRangeGroups()
114#endif
b75a7d8f
A
115
116private:
117 void numberSets();
118
119 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
120 UErrorCode *fStatus;
121
122 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
123
0f5d89e8
A
124 UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
125 uint32_t fTrieSize; // the Unicode Sets.
b75a7d8f
A
126
127 // Groups correspond to character categories -
128 // groups of ranges that are in the same original UnicodeSets.
129 // fGroupCount is the index of the last used group.
73c04bcf
A
130 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
131 // State table column 0 is not used. Column 1 is for end-of-input.
132 // column 2 is for group 0. Funny counting.
b75a7d8f
A
133 int32_t fGroupCount;
134
73c04bcf
A
135 UBool fSawBOF;
136
b75a7d8f
A
137 RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
138 RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
139};
140
141
142
143U_NAMESPACE_END
0f5d89e8
A
144
145#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
146
b75a7d8f 147#endif