1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 // Copyright (C) 2002-2011, International Business Machines Corporation and others.
5 // All Rights Reserved.
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
12 #include "unicode/brkiter.h"
13 #include "unicode/ubrk.h"
14 #include "unicode/unistr.h"
15 #include "unicode/uniset.h"
16 #include "unicode/uchar.h"
17 #include "unicode/uchriter.h"
18 #include "unicode/parsepos.h"
19 #include "unicode/parseerr.h"
23 #include "rbbidata57.h"
25 #include "rbbisetb57.h"
28 #include "rbbiscan57.h"
29 #include "rbbitblb57.h"
35 //----------------------------------------------------------------------------------------
39 //----------------------------------------------------------------------------------------
40 RBBIRuleBuilder57::RBBIRuleBuilder57(const UnicodeString
&rules
,
41 UParseError
*parseErr
,
45 fStatus
= &status
; // status is checked below
46 fParseError
= parseErr
;
49 fDebugEnv
= getenv("U_RBBIDEBUG");
57 fDefaultTree
= &fForwardTree
;
58 fForwardTables
= NULL
;
59 fReverseTables
= NULL
;
60 fSafeFwdTables
= NULL
;
61 fSafeRevTables
= NULL
;
62 fRuleStatusVals
= NULL
;
65 fLookAheadHardBreak
= FALSE
;
68 fRuleStatusVals
= NULL
;
72 uprv_memset(parseErr
, 0, sizeof(UParseError
));
75 if (U_FAILURE(status
)) {
79 fUSetNodes
= new UVector(status
); // bcos status gets overwritten here
80 fRuleStatusVals
= new UVector(status
);
81 fScanner
= new RBBIRuleScanner57(this);
82 fSetBuilder
= new RBBISetBuilder57(this);
83 if (U_FAILURE(status
)) {
86 if(fSetBuilder
== 0 || fScanner
== 0 || fUSetNodes
== 0 || fRuleStatusVals
== 0) {
87 status
= U_MEMORY_ALLOCATION_ERROR
;
93 //----------------------------------------------------------------------------------------
97 //----------------------------------------------------------------------------------------
98 RBBIRuleBuilder57::~RBBIRuleBuilder57() {
102 RBBINode
*n
= (RBBINode
*)fUSetNodes
->elementAt(i
);
111 delete fForwardTables
;
112 delete fReverseTables
;
113 delete fSafeFwdTables
;
114 delete fSafeRevTables
;
121 delete fRuleStatusVals
;
128 //----------------------------------------------------------------------------------------
130 // flattenData() - Collect up the compiled RBBI rule data and put it into
131 // the format for saving in ICU data files,
132 // which is also the format needed by the RBBI runtime engine.
134 //----------------------------------------------------------------------------------------
// align8() - Round a size up to the next multiple of 8.
//            A value that is already a multiple of 8 is returned unchanged.
//            The section sizes computed for the flattened data are passed
//            through this so that each section is padded for alignment.
static int32_t align8(int32_t i) {
    // Mask with the low three bits cleared; unsigned, exactly like the literal.
    const uint32_t kLow3BitsCleared = 0xfffffff8;
    // Bump past the next boundary first, then drop the remainder below.
    const int32_t bumped = i + 7;
    return bumped & kLow3BitsCleared;
}
137 RBBIDataHeader57
*RBBIRuleBuilder57::flattenData() {
140 if (U_FAILURE(*fStatus
)) {
144 // Remove comments and whitespace from the rules to make it smaller.
145 UnicodeString
strippedRules((const UnicodeString
&)RBBIRuleScanner57::stripRules(fRules
));
147 // Calculate the size of each section in the data.
148 // Sizes here are padded up to a multiple of 8 for better memory alignment.
149 // Sections sizes actually stored in the header are for the actual data
150 // without the padding.
152 int32_t headerSize
= align8(sizeof(RBBIDataHeader57
));
153 int32_t forwardTableSize
= align8(fForwardTables
->getTableSize());
154 int32_t reverseTableSize
= align8(fReverseTables
->getTableSize());
155 int32_t safeFwdTableSize
= align8(fSafeFwdTables
->getTableSize());
156 int32_t safeRevTableSize
= align8(fSafeRevTables
->getTableSize());
157 int32_t trieSize
= align8(fSetBuilder
->getTrieSize());
158 int32_t statusTableSize
= align8(fRuleStatusVals
->size() * sizeof(int32_t));
159 int32_t rulesSize
= align8((strippedRules
.length()+1) * sizeof(UChar
));
161 int32_t totalSize
= headerSize
+ forwardTableSize
+ reverseTableSize
162 + safeFwdTableSize
+ safeRevTableSize
163 + statusTableSize
+ trieSize
+ rulesSize
;
165 RBBIDataHeader57
*data
= (RBBIDataHeader57
*)uprv_malloc(totalSize
);
167 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
170 uprv_memset(data
, 0, totalSize
);
173 data
->fMagic
= 0xb1a0;
174 data
->fFormatVersion
[0] = 3;
175 data
->fFormatVersion
[1] = 1;
176 data
->fFormatVersion
[2] = 0;
177 data
->fFormatVersion
[3] = 0;
178 data
->fLength
= totalSize
;
179 data
->fCatCount
= fSetBuilder
->getNumCharCategories();
181 data
->fFTable
= headerSize
;
182 data
->fFTableLen
= forwardTableSize
;
183 data
->fRTable
= data
->fFTable
+ forwardTableSize
;
184 data
->fRTableLen
= reverseTableSize
;
185 data
->fSFTable
= data
->fRTable
+ reverseTableSize
;
186 data
->fSFTableLen
= safeFwdTableSize
;
187 data
->fSRTable
= data
->fSFTable
+ safeFwdTableSize
;
188 data
->fSRTableLen
= safeRevTableSize
;
190 data
->fTrie
= data
->fSRTable
+ safeRevTableSize
;
191 data
->fTrieLen
= fSetBuilder
->getTrieSize();
192 data
->fStatusTable
= data
->fTrie
+ trieSize
;
193 data
->fStatusTableLen
= statusTableSize
;
194 data
->fRuleSource
= data
->fStatusTable
+ statusTableSize
;
195 data
->fRuleSourceLen
= strippedRules
.length() * sizeof(UChar
);
197 uprv_memset(data
->fReserved
, 0, sizeof(data
->fReserved
));
199 fForwardTables
->exportTable((uint8_t *)data
+ data
->fFTable
);
200 fReverseTables
->exportTable((uint8_t *)data
+ data
->fRTable
);
201 fSafeFwdTables
->exportTable((uint8_t *)data
+ data
->fSFTable
);
202 fSafeRevTables
->exportTable((uint8_t *)data
+ data
->fSRTable
);
203 fSetBuilder
->serializeTrie ((uint8_t *)data
+ data
->fTrie
);
205 int32_t *ruleStatusTable
= (int32_t *)((uint8_t *)data
+ data
->fStatusTable
);
206 for (i
=0; i
<fRuleStatusVals
->size(); i
++) {
207 ruleStatusTable
[i
] = fRuleStatusVals
->elementAti(i
);
210 strippedRules
.extract((UChar
*)((uint8_t *)data
+data
->fRuleSource
), rulesSize
/2+1, *fStatus
);
220 //----------------------------------------------------------------------------------------
222 // createRuleBasedBreakIterator construct a break iterator from source rules
223 // that are supplied as a UnicodeString
225 //----------------------------------------------------------------------------------------
227 RBBIRuleBuilder57::createRuleBasedBreakIterator( const UnicodeString
&rules
,
228 UParseError
*parseError
,
231 // status checked below
234 // Read the input rules, generate a parse tree, symbol table,
235 // and list of all Unicode Sets referenced by the rules.
237 RBBIRuleBuilder57
builder(rules
, parseError
, status
);
238 if (U_FAILURE(status
)) { // status checked here bcos build below doesn't
241 builder
.fScanner
->parse();
244 // UnicodeSet processing.
245 // Munge the Unicode Sets to create a set of character categories.
246 // Generate the mapping tables (TRIE) from input 32-bit characters to
247 // the character categories.
249 builder
.fSetBuilder
->build();
253 // Generate the DFA state transition table.
255 builder
.fForwardTables
= new RBBITableBuilder57(&builder
, &builder
.fForwardTree
);
256 builder
.fReverseTables
= new RBBITableBuilder57(&builder
, &builder
.fReverseTree
);
257 builder
.fSafeFwdTables
= new RBBITableBuilder57(&builder
, &builder
.fSafeFwdTree
);
258 builder
.fSafeRevTables
= new RBBITableBuilder57(&builder
, &builder
.fSafeRevTree
);
259 if (builder
.fForwardTables
== NULL
|| builder
.fReverseTables
== NULL
||
260 builder
.fSafeFwdTables
== NULL
|| builder
.fSafeRevTables
== NULL
)
262 status
= U_MEMORY_ALLOCATION_ERROR
;
263 delete builder
.fForwardTables
; builder
.fForwardTables
= NULL
;
264 delete builder
.fReverseTables
; builder
.fReverseTables
= NULL
;
265 delete builder
.fSafeFwdTables
; builder
.fSafeFwdTables
= NULL
;
266 delete builder
.fSafeRevTables
; builder
.fSafeRevTables
= NULL
;
270 builder
.fForwardTables
->build();
271 builder
.fReverseTables
->build();
272 builder
.fSafeFwdTables
->build();
273 builder
.fSafeRevTables
->build();
276 if (builder
.fDebugEnv
&& uprv_strstr(builder
.fDebugEnv
, "states")) {
277 builder
.fForwardTables
->printRuleStatusTable();
282 // Package up the compiled data into a memory image
283 // in the run-time format.
285 RBBIDataHeader57
*data
= builder
.flattenData(); // returns NULL if error
286 if (U_FAILURE(*builder
.fStatus
)) {
292 // Clean up the compiler related stuff
297 // Create a break iterator from the compiled rules.
298 // (Identical to creation from stored pre-compiled rules)
300 // status is checked after init in construction.
301 RuleBasedBreakIterator57
*This
= new RuleBasedBreakIterator57(data
, status
);
302 if (U_FAILURE(status
)) {
306 else if(This
== NULL
) { // test for NULL
307 status
= U_MEMORY_ALLOCATION_ERROR
;
314 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */