1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
6 // Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 // All Rights Reserved.
9 // This file contains the RBBIRuleBuilder class implementation. This is the main class for
10 // building (compiling) break rules into the tables required by the RBBI runtime engine.
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_BREAK_ITERATION
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/parsepos.h"
26 #include "unicode/parseerr.h"
42 //----------------------------------------------------------------------------------------
46 //----------------------------------------------------------------------------------------
// Constructor.
//   rules     - rule source text; copied into both fRules and fStrippedRules.
//   parseErr  - caller's UParseError; stored in fParseError and zero-filled below.
//   status    - error code; its address is kept in fStatus so later stages can report through it.
// Allocates the two UVectors, the rule scanner and the set builder. If any of those
// pointers is null, status is set to U_MEMORY_ALLOCATION_ERROR.
47 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString
&rules
,
48 UParseError
*parseErr
,
50 : fRules(rules
), fStrippedRules(rules
)
52 fStatus
= &status
; // status is checked below
53 fParseError
= parseErr
;
// Debug switches are read from the U_RBBIDEBUG environment variable (see its use in build()).
56 fDebugEnv
= getenv("U_RBBIDEBUG");
64 fDefaultTree
= &fForwardTree
;
66 fRuleStatusVals
= NULL
;
69 fLookAheadHardBreak
= FALSE
;
// NOTE(review): fRuleStatusVals is set to NULL a second time here; the earlier assignment
// above already does this, so one of the two looks redundant.
71 fRuleStatusVals
= NULL
;
75 uprv_memset(parseErr
, 0, sizeof(UParseError
));
78 if (U_FAILURE(status
)) {
82 fUSetNodes
= new UVector(status
); // because status gets overwritten here
83 fRuleStatusVals
= new UVector(status
);
84 fScanner
= new RBBIRuleScanner(this);
85 fSetBuilder
= new RBBISetBuilder(this);
86 if (U_FAILURE(status
)) {
// Any null pointer among the four allocations above is reported as an out-of-memory error.
89 if(fSetBuilder
== 0 || fScanner
== 0 || fUSetNodes
== 0 || fRuleStatusVals
== 0) {
90 status
= U_MEMORY_ALLOCATION_ERROR
;
96 //----------------------------------------------------------------------------------------
100 //----------------------------------------------------------------------------------------
// Destructor. Fetches the elements of fUSetNodes as RBBINode pointers (indexed by i)
// and deletes the objects owned by the builder, including fForwardTable and fRuleStatusVals.
101 RBBIRuleBuilder::~RBBIRuleBuilder() {
// fUSetNodes stores untyped elements, hence the cast back to RBBINode*.
105 RBBINode
*n
= (RBBINode
*)fUSetNodes
->elementAt(i
);
114 delete fForwardTable
;
120 delete fRuleStatusVals
;
127 //----------------------------------------------------------------------------------------
129 // flattenData() - Collect up the compiled RBBI rule data and put it into
130 // the format for saving in ICU data files,
131 // which is also the format needed by the RBBI runtime engine.
133 //----------------------------------------------------------------------------------------
// Round i up to the nearest multiple of 8; a value that is already a multiple of 8
// is returned unchanged. Used to pad each section of the flattened data.
// The mask is a signed int32_t so the whole expression stays in int32_t rather than
// passing through an unsigned intermediate (as the literal 0xfffffff8 would force).
static int32_t align8(int32_t i) {
    return (i + 7) & ~static_cast<int32_t>(7);
}
// Builds one contiguous, zero-initialised memory image holding an RBBIDataHeader followed
// by the forward table, the safe (reverse) table, the trie, the rule status table and the
// stripped rule source. Section offsets and lengths are recorded in the header.
// Errors are reported through *fStatus; the caller in build() notes that NULL is
// returned on error.
136 RBBIDataHeader
*RBBIRuleBuilder::flattenData() {
139 if (U_FAILURE(*fStatus
)) {
143 // Remove whitespace from the rules to make it smaller.
144 // The rule parser has already removed comments.
145 fStrippedRules
= fScanner
->stripRules(fStrippedRules
);
147 // Calculate the size of each section in the data.
148 // Sizes here are padded up to a multiple of 8 for better memory alignment.
149 // Sections sizes actually stored in the header are for the actual data
150 // without the padding.
152 int32_t headerSize
= align8(sizeof(RBBIDataHeader
));
153 int32_t forwardTableSize
= align8(fForwardTable
->getTableSize());
// The reverse table is the "safe" table, produced by the same RBBITableBuilder (fForwardTable).
154 int32_t reverseTableSize
= align8(fForwardTable
->getSafeTableSize());
155 int32_t trieSize
= align8(fSetBuilder
->getTrieSize());
156 int32_t statusTableSize
= align8(fRuleStatusVals
->size() * sizeof(int32_t));
// +1 leaves room for a terminating NUL after the rule text.
157 int32_t rulesSize
= align8((fStrippedRules
.length()+1) * sizeof(UChar
));
159 int32_t totalSize
= headerSize
162 + statusTableSize
+ trieSize
+ rulesSize
;
164 RBBIDataHeader
*data
= (RBBIDataHeader
*)uprv_malloc(totalSize
);
166 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
// Zero the whole image so that padding bytes between sections are deterministic.
169 uprv_memset(data
, 0, totalSize
);
172 data
->fMagic
= 0xb1a0;
173 data
->fFormatVersion
[0] = RBBI_DATA_FORMAT_VERSION
[0];
174 data
->fFormatVersion
[1] = RBBI_DATA_FORMAT_VERSION
[1];
175 data
->fFormatVersion
[2] = RBBI_DATA_FORMAT_VERSION
[2];
176 data
->fFormatVersion
[3] = RBBI_DATA_FORMAT_VERSION
[3];
177 data
->fLength
= totalSize
;
178 data
->fCatCount
= fSetBuilder
->getNumCharCategories();
// Section offsets are byte offsets from the start of the header; each section starts
// where the previous one ends.
180 data
->fFTable
= headerSize
;
// NOTE(review): forwardTableSize, reverseTableSize and statusTableSize are the align8()-padded
// values, so fFTableLen, fRTableLen and fStatusTableLen store padded lengths, whereas
// fTrieLen and fRuleSourceLen store unpadded ones. The comment above says stored sizes are
// unpadded - confirm which is intended.
181 data
->fFTableLen
= forwardTableSize
;
183 data
->fRTable
= data
->fFTable
+ data
->fFTableLen
;
184 data
->fRTableLen
= reverseTableSize
;
186 data
->fTrie
= data
->fRTable
+ data
->fRTableLen
;
// The trie length is stored unpadded; the following offset advances by the padded trieSize.
187 data
->fTrieLen
= fSetBuilder
->getTrieSize();
188 data
->fStatusTable
= data
->fTrie
+ trieSize
;
189 data
->fStatusTableLen
= statusTableSize
;
190 data
->fRuleSource
= data
->fStatusTable
+ statusTableSize
;
// Rule source length is in bytes and excludes the terminating NUL.
191 data
->fRuleSourceLen
= fStrippedRules
.length() * sizeof(UChar
);
193 uprv_memset(data
->fReserved
, 0, sizeof(data
->fReserved
));
// Copy each section into the image at the offset recorded in the header.
195 fForwardTable
->exportTable((uint8_t *)data
+ data
->fFTable
);
196 fForwardTable
->exportSafeTable((uint8_t *)data
+ data
->fRTable
);
197 fSetBuilder
->serializeTrie ((uint8_t *)data
+ data
->fTrie
);
199 int32_t *ruleStatusTable
= (int32_t *)((uint8_t *)data
+ data
->fStatusTable
);
200 for (i
=0; i
<fRuleStatusVals
->size(); i
++) {
201 ruleStatusTable
[i
] = fRuleStatusVals
->elementAti(i
);
// The capacity passed to extract() is rulesSize/2+1.
// NOTE(review): if sizeof(UChar) is 2, the rules section reserves only rulesSize/2 UChars,
// so this capacity looks one larger than the reserved space - confirm. The text plus its
// NUL fits in rulesSize bytes by construction of rulesSize above.
204 fStrippedRules
.extract((UChar
*)((uint8_t *)data
+data
->fRuleSource
), rulesSize
/2+1, *fStatus
);
210 //----------------------------------------------------------------------------------------
212 // createRuleBasedBreakIterator() - construct a break iterator from source rules
213 // that are passed in as a UnicodeString.
215 //----------------------------------------------------------------------------------------
// Compiles the given rules with a local RBBIRuleBuilder, then wraps the resulting
// RBBIDataHeader in a newly allocated RuleBasedBreakIterator.
//   rules      - rule source text handed to the builder.
//   parseError - forwarded to the builder's constructor.
//   status     - checked after construction, after build(), and after creating the iterator;
//                set to U_MEMORY_ALLOCATION_ERROR if the iterator allocation returned NULL.
217 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString
&rules
,
218 UParseError
*parseError
,
222 // Read the input rules, generate a parse tree, symbol table,
223 // and list of all Unicode Sets referenced by the rules.
225 RBBIRuleBuilder
builder(rules
, parseError
, status
);
226 if (U_FAILURE(status
)) { // status checked here because build below doesn't
230 RBBIDataHeader
*data
= builder
.build(status
);
232 if (U_FAILURE(status
)) {
237 // Create a break iterator from the compiled rules.
238 // (Identical to creation from stored pre-compiled rules)
240 // status is checked after init in construction.
241 RuleBasedBreakIterator
*This
= new RuleBasedBreakIterator(data
, status
);
242 if (U_FAILURE(status
)) {
246 else if(This
== NULL
) { // test for NULL
247 status
= U_MEMORY_ALLOCATION_ERROR
;
// Runs the compilation stages visible here, in order: build the character ranges,
// construct the RBBITableBuilder and its forward and safe-reverse tables, optionally
// print debug output, build the trie, and finally flatten everything into an
// RBBIDataHeader image via flattenData().
//   status - error code; set to U_MEMORY_ALLOCATION_ERROR if the table builder
//            allocation returned nullptr.
252 RBBIDataHeader
*RBBIRuleBuilder::build(UErrorCode
&status
) {
253 if (U_FAILURE(status
)) {
258 if (U_FAILURE(status
)) {
263 // UnicodeSet processing.
264 // Munge the Unicode Sets to create a set of character categories.
265 // Generate the mapping tables (TRIE) from input code points to
266 // the character categories.
268 fSetBuilder
->buildRanges();
271 // Generate the DFA state transition table.
273 fForwardTable
= new RBBITableBuilder(this, &fForwardTree
, status
);
274 if (fForwardTable
== nullptr) {
275 status
= U_MEMORY_ALLOCATION_ERROR
;
279 fForwardTable
->buildForwardTable();
281 fForwardTable
->buildSafeReverseTable(status
);
// Debug dump, enabled when the U_RBBIDEBUG value captured in fDebugEnv contains "states".
285 if (fDebugEnv
&& uprv_strstr(fDebugEnv
, "states")) {
286 fForwardTable
->printStates();
287 fForwardTable
->printRuleStatusTable();
288 fForwardTable
->printReverseTable();
292 fSetBuilder
->buildTrie();
295 // Package up the compiled data into a memory image
296 // in the run-time format.
298 RBBIDataHeader
*data
= flattenData(); // returns NULL if error
299 if (U_FAILURE(status
)) {
// Shrinks the compiled tables by merging duplicate character classes and removing
// duplicate states. The outer loop repeats for as long as didSomething is set
// during a pass.
305 void RBBIRuleBuilder::optimizeTables() {
308 didSomething
= false;
310 // Begin looking for duplicates with char class 3.
311 // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
312 // and should not have other categories merged into them.
313 IntPair duplPair
= {3, 0};
// For each duplicate pair found: merge the categories in the set builder, then drop
// the column duplPair.second from the forward table.
314 while (fForwardTable
->findDuplCharClassFrom(&duplPair
)) {
315 fSetBuilder
->mergeCategories(duplPair
);
316 fForwardTable
->removeColumn(duplPair
.second
);
// Keep removing duplicate states until a call reports that none were removed.
320 while (fForwardTable
->removeDuplicateStates() > 0) {
323 } while (didSomething
);
328 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */