]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbisetb57.h
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / rbbisetb57.h
CommitLineData
0f5d89e8
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (c) 2001-2005, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*
9**********************************************************************
10* Legacy version of RangeDescriptor and RBBISetBuilder from ICU 57,
11* only for use by Apple RuleBasedTokenizer
12**********************************************************************
13*/
14
15#ifndef RBBISETB57_H
16#define RBBISETB57_H
17
18#include "unicode/utypes.h"
19#include "unicode/uobject.h"
20#include "rbbirb57.h"
21#include "rbbinode.h"
22#include "uvector.h"
23
24struct UNewTrie;
25
26U_NAMESPACE_BEGIN
27
28class RBBIRuleBuilder57;
29
30//
31// RBBISetBuilder57 Derives the character categories used by the runtime RBBI engine
32// from the Unicode Sets appearing in the source RBBI rules, and
33// creates the TRIE table used to map from Unicode to the
34// character categories.
35//
36
37
38//
39// RangeDescriptor57
40//
41// Each of the non-overlapping character ranges gets one of these descriptors.
42// All of them are strung together in a linked list, which is kept in order
43// (by character)
44//
45class RangeDescriptor57 : public UMemory {
46public:
47 UChar32 fStartChar; // Start of range, unicode 32 bit value.
48 UChar32 fEndChar; // End of range, unicode 32 bit value.
49 int32_t fNum; // Runtime-mapped input value for this range.
50 UVector *fIncludesSets; // Vector of the original
51 // Unicode sets that include this range.
52 // (Contains ptrs to uset nodes)
53 RangeDescriptor57 *fNext; // Next RangeDescriptor57 in the linked list.
54
55 RangeDescriptor57(UErrorCode &status); // status: ICU error code, passed by reference.
56 RangeDescriptor57(const RangeDescriptor57 &other, UErrorCode &status); // Construct from "other"; this is the public,
 // status-taking alternative to the private copy constructor below.
 // NOTE(review): presumably copies the range and its set list - confirm in the implementation.
57 ~RangeDescriptor57(); // NOTE(review): presumably releases fIncludesSets - confirm in the implementation.
58 void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with
59 // where appearing in the second (higher) part.
60 void setDictionaryFlag(); // Check whether this range appears as part of
61 // the Unicode set named "dictionary"
 // NOTE(review): returns void, so the result is presumably recorded
 // in this descriptor - confirm where.
62
63private:
64 RangeDescriptor57(const RangeDescriptor57 &other); // Declared private to forbid copying of this class.
65 RangeDescriptor57 &operator=(const RangeDescriptor57 &other); // Declared private to forbid assignment of this class.
66};
67
68
69//
70// RBBISetBuilder57 Handles processing of Unicode Sets from RBBI rules.
71//
72// Starting with the rules parse tree from the scanner,
73//
74// - Enumerate the set of UnicodeSets that are referenced
75// by the RBBI rules.
76// - compute a derived set of non-overlapping UnicodeSets
77// that will correspond to columns in the state table for
78// the RBBI execution engine.
79// - construct the trie table that maps input characters
80// to set numbers in the non-overlapping set of sets.
81//
82
83
84class RBBISetBuilder57 : public UMemory {
85public:
86 RBBISetBuilder57(RBBIRuleBuilder57 *rb); // rb: the rule builder this set builder works for (see fRB).
87 ~RBBISetBuilder57();
88
89 void build(); // NOTE(review): presumably performs the set processing described for this class - confirm in the implementation.
90 void addValToSets(UVector *sets, uint32_t val); // NOTE(review): presumably applies addValToSet to each node in "sets" - confirm.
91 void addValToSet (RBBINode *usetNode, uint32_t val); // usetNode: a uset node from the rules parse tree; val: value to add.
92 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
93 // runtime state machine, which are the same as
94 // columns in the DFA state table
95 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
 // Deliberately not const (see the commented-out qualifier);
 // NOTE(review): presumably because it updates fTrieSize - confirm.
96 void serializeTrie(uint8_t *where); // Write out the serialized Trie to "where".
 // NOTE(review): caller presumably provides at least getTrieSize() bytes - confirm.
97 UChar32 getFirstChar(int32_t val) const; // NOTE(review): presumably the first character mapped to category "val" - confirm.
98 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
99 // character were encountered.
100#ifdef RBBI_DEBUG
101 void printSets();
102 void printRanges();
103 void printRangeGroups();
104#else // Non-debug builds: the print calls are defined away as empty function-like macros.
 // Being preprocessor macros, these names are not scoped to this class.
105 #define printSets()
106 #define printRanges()
107 #define printRangeGroups()
108#endif
109
110private:
111 void numberSets();
112
113 RBBIRuleBuilder57 *fRB; // The RBBI Rule Compiler that owns us.
114 UErrorCode *fStatus; // Pointer to an ICU error code; NOTE(review): presumably the rule builder's status - confirm.
115
116 RangeDescriptor57 *fRangeList; // Head of the linked list of RangeDescriptors
117
118 UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
119 uint32_t fTrieSize; // the Unicode Sets.
120
121 // Groups correspond to character categories -
122 // groups of ranges that are in the same original UnicodeSets.
123 // fGroupCount is the index of the last used group.
124 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
125 // State table column 0 is not used. Column 1 is for end-of-input.
126 // Column 2 is for group 0. Funny counting.
127 int32_t fGroupCount;
128
129 UBool fSawBOF; // Reported by sawBOF(): whether a {bof} reference was encountered.
130
131 RBBISetBuilder57(const RBBISetBuilder57 &other); // Declared private to forbid copying of this class.
132 RBBISetBuilder57 &operator=(const RBBISetBuilder57 &other); // Declared private to forbid assignment of this class.
133};
134
135
136
137U_NAMESPACE_END
138#endif