]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbirb.cpp
ICU-59131.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbirb.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3//
4// file: rbbirb.cpp
5//
4388f060 6// Copyright (C) 2002-2011, International Business Machines Corporation and others.
b75a7d8f
A
7// All Rights Reserved.
8//
9// This file contains the RBBIRuleBuilder class implementation. This is the main class for
10// building (compiling) break rules into the tables required by the runtime
11// RBBI engine.
12//
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/brkiter.h"
19#include "unicode/rbbi.h"
20#include "unicode/ubrk.h"
21#include "unicode/unistr.h"
22#include "unicode/uniset.h"
23#include "unicode/uchar.h"
24#include "unicode/uchriter.h"
25#include "unicode/parsepos.h"
26#include "unicode/parseerr.h"
27#include "cmemory.h"
28#include "cstring.h"
29
30#include "rbbirb.h"
31#include "rbbinode.h"
32
33#include "rbbiscan.h"
34#include "rbbisetb.h"
35#include "rbbitblb.h"
36#include "rbbidata.h"
37
38
39U_NAMESPACE_BEGIN
40
41
42//----------------------------------------------------------------------------------------
43//
44// Constructor.
45//
46//----------------------------------------------------------------------------------------
47RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
46f4442e 48 UParseError *parseErr,
b75a7d8f
A
49 UErrorCode &status)
50 : fRules(rules)
51{
374ca955 52 fStatus = &status; // status is checked below
46f4442e 53 fParseError = parseErr;
b75a7d8f
A
54 fDebugEnv = NULL;
55#ifdef RBBI_DEBUG
56 fDebugEnv = getenv("U_RBBIDEBUG");
57#endif
58
59
60 fForwardTree = NULL;
61 fReverseTree = NULL;
374ca955
A
62 fSafeFwdTree = NULL;
63 fSafeRevTree = NULL;
64 fDefaultTree = &fForwardTree;
b75a7d8f
A
65 fForwardTables = NULL;
66 fReverseTables = NULL;
374ca955
A
67 fSafeFwdTables = NULL;
68 fSafeRevTables = NULL;
69 fRuleStatusVals = NULL;
70 fChainRules = FALSE;
71 fLBCMNoChain = FALSE;
72 fLookAheadHardBreak = FALSE;
73 fUSetNodes = NULL;
74 fRuleStatusVals = NULL;
75 fScanner = NULL;
76 fSetBuilder = NULL;
46f4442e
A
77 if (parseErr) {
78 uprv_memset(parseErr, 0, sizeof(UParseError));
79 }
374ca955
A
80
81 if (U_FAILURE(status)) {
82 return;
83 }
84
85 fUSetNodes = new UVector(status); // bcos status gets overwritten here
86 fRuleStatusVals = new UVector(status);
b75a7d8f
A
87 fScanner = new RBBIRuleScanner(this);
88 fSetBuilder = new RBBISetBuilder(this);
374ca955
A
89 if (U_FAILURE(status)) {
90 return;
91 }
92 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
b75a7d8f
A
93 status = U_MEMORY_ALLOCATION_ERROR;
94 }
95}
96
97
98
99//----------------------------------------------------------------------------------------
100//
101// Destructor
102//
103//----------------------------------------------------------------------------------------
104RBBIRuleBuilder::~RBBIRuleBuilder() {
105
106 int i;
107 for (i=0; ; i++) {
108 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
109 if (n==NULL) {
110 break;
111 }
112 delete n;
113 }
114
115 delete fUSetNodes;
116 delete fSetBuilder;
117 delete fForwardTables;
118 delete fReverseTables;
374ca955
A
119 delete fSafeFwdTables;
120 delete fSafeRevTables;
121
b75a7d8f
A
122 delete fForwardTree;
123 delete fReverseTree;
374ca955
A
124 delete fSafeFwdTree;
125 delete fSafeRevTree;
b75a7d8f 126 delete fScanner;
374ca955 127 delete fRuleStatusVals;
b75a7d8f
A
128}
129
130
131
132
133
134//----------------------------------------------------------------------------------------
135//
136// flattenData() - Collect up the compiled RBBI rule data and put it into
137// the format for saving in ICU data files,
138// which is also the format needed by the RBBI runtime engine.
139//
140//----------------------------------------------------------------------------------------
// Round i up to the nearest multiple of 8; multiples of 8 are returned unchanged.
static int32_t align8(int32_t i) {
    const int32_t bumped = i + 7;      // step past the next 8-byte boundary unless already on one
    return bumped & 0xfffffff8;        // clear the low three bits
}
142
143RBBIDataHeader *RBBIRuleBuilder::flattenData() {
374ca955
A
144 int32_t i;
145
b75a7d8f
A
146 if (U_FAILURE(*fStatus)) {
147 return NULL;
148 }
149
150 // Remove comments and whitespace from the rules to make it smaller.
73c04bcf 151 UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
b75a7d8f
A
152
153 // Calculate the size of each section in the data.
154 // Sizes here are padded up to a multiple of 8 for better memory alignment.
155 // Sections sizes actually stored in the header are for the actual data
156 // without the padding.
157 //
158 int32_t headerSize = align8(sizeof(RBBIDataHeader));
159 int32_t forwardTableSize = align8(fForwardTables->getTableSize());
160 int32_t reverseTableSize = align8(fReverseTables->getTableSize());
374ca955
A
161 int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
162 int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
b75a7d8f 163 int32_t trieSize = align8(fSetBuilder->getTrieSize());
374ca955 164 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
b75a7d8f
A
165 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
166
167 int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
374ca955
A
168 + safeFwdTableSize + safeRevTableSize
169 + statusTableSize + trieSize + rulesSize;
170
b75a7d8f
A
171 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
172 if (data == NULL) {
173 *fStatus = U_MEMORY_ALLOCATION_ERROR;
174 return NULL;
175 }
176 uprv_memset(data, 0, totalSize);
177
178
73c04bcf
A
179 data->fMagic = 0xb1a0;
180 data->fFormatVersion[0] = 3;
181 data->fFormatVersion[1] = 1;
182 data->fFormatVersion[2] = 0;
183 data->fFormatVersion[3] = 0;
184 data->fLength = totalSize;
185 data->fCatCount = fSetBuilder->getNumCharCategories();
b75a7d8f
A
186
187 data->fFTable = headerSize;
188 data->fFTableLen = forwardTableSize;
374ca955 189 data->fRTable = data->fFTable + forwardTableSize;
b75a7d8f 190 data->fRTableLen = reverseTableSize;
374ca955
A
191 data->fSFTable = data->fRTable + reverseTableSize;
192 data->fSFTableLen = safeFwdTableSize;
193 data->fSRTable = data->fSFTable + safeFwdTableSize;
194 data->fSRTableLen = safeRevTableSize;
195
196 data->fTrie = data->fSRTable + safeRevTableSize;
b75a7d8f 197 data->fTrieLen = fSetBuilder->getTrieSize();
374ca955
A
198 data->fStatusTable = data->fTrie + trieSize;
199 data->fStatusTableLen= statusTableSize;
200 data->fRuleSource = data->fStatusTable + statusTableSize;
b75a7d8f
A
201 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
202
203 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
204
205 fForwardTables->exportTable((uint8_t *)data + data->fFTable);
206 fReverseTables->exportTable((uint8_t *)data + data->fRTable);
374ca955
A
207 fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
208 fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
b75a7d8f 209 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
374ca955
A
210
211 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
212 for (i=0; i<fRuleStatusVals->size(); i++) {
213 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
214 }
215
b75a7d8f
A
216 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
217
218 return data;
219}
220
221
222
223
224
225
226//----------------------------------------------------------------------------------------
227//
228// createRuleBasedBreakIterator construct from source rules that are passed in
229// in a UnicodeString
230//
231//----------------------------------------------------------------------------------------
232BreakIterator *
233RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
46f4442e 234 UParseError *parseError,
b75a7d8f
A
235 UErrorCode &status)
236{
374ca955 237 // status checked below
b75a7d8f
A
238
239 //
240 // Read the input rules, generate a parse tree, symbol table,
241 // and list of all Unicode Sets referenced by the rules.
242 //
243 RBBIRuleBuilder builder(rules, parseError, status);
374ca955 244 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
b75a7d8f
A
245 return NULL;
246 }
46f4442e 247 builder.fScanner->parse();
b75a7d8f
A
248
249 //
250 // UnicodeSet processing.
251 // Munge the Unicode Sets to create a set of character categories.
252 // Generate the mapping tables (TRIE) from input 32-bit characters to
253 // the character categories.
254 //
255 builder.fSetBuilder->build();
256
257
258 //
259 // Generate the DFA state transition table.
260 //
261 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
262 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
374ca955
A
263 builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
264 builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
4388f060
A
265 if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
266 builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
374ca955 267 {
b75a7d8f 268 status = U_MEMORY_ALLOCATION_ERROR;
4388f060
A
269 delete builder.fForwardTables; builder.fForwardTables = NULL;
270 delete builder.fReverseTables; builder.fReverseTables = NULL;
271 delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
272 delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
b75a7d8f
A
273 return NULL;
274 }
275
276 builder.fForwardTables->build();
277 builder.fReverseTables->build();
374ca955
A
278 builder.fSafeFwdTables->build();
279 builder.fSafeRevTables->build();
b75a7d8f 280
374ca955
A
281#ifdef RBBI_DEBUG
282 if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
283 builder.fForwardTables->printRuleStatusTable();
284 }
285#endif
b75a7d8f
A
286
287 //
288 // Package up the compiled data into a memory image
289 // in the run-time format.
290 //
374ca955 291 RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
46f4442e
A
292 if (U_FAILURE(*builder.fStatus)) {
293 return NULL;
294 }
b75a7d8f
A
295
296
297 //
298 // Clean up the compiler related stuff
299 //
300
301
302 //
303 // Create a break iterator from the compiled rules.
304 // (Identical to creation from stored pre-compiled rules)
305 //
374ca955 306 // status is checked after init in construction.
b75a7d8f 307 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
b75a7d8f
A
308 if (U_FAILURE(status)) {
309 delete This;
310 This = NULL;
374ca955
A
311 }
312 else if(This == NULL) { // test for NULL
313 status = U_MEMORY_ALLOCATION_ERROR;
b75a7d8f
A
314 }
315 return This;
316}
317
318U_NAMESPACE_END
319
320#endif /* #if !UCONFIG_NO_BREAK_ITERATION */