]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbirb.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / common / rbbirb.cpp
1 //
2 // file: rbbirb.cpp
3 //
4 // Copyright (C) 2002-2005, International Business Machines Corporation and others.
5 // All Rights Reserved.
6 //
7 // This file contains the RBBIRuleBuilder class implementation. This is the main class for
8 // building (compiling) break rules into the tables required by the runtime
9 // RBBI engine.
10 //
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/brkiter.h"
17 #include "unicode/rbbi.h"
18 #include "unicode/ubrk.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "cmemory.h"
26 #include "cstring.h"
27
28 #include "rbbirb.h"
29 #include "rbbinode.h"
30
31 #include "rbbiscan.h"
32 #include "rbbisetb.h"
33 #include "rbbitblb.h"
34 #include "rbbidata.h"
35
36
37 U_NAMESPACE_BEGIN
38
39
40 //----------------------------------------------------------------------------------------
41 //
42 // Constructor.
43 //
44 //----------------------------------------------------------------------------------------
45 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
46 UParseError &parseErr,
47 UErrorCode &status)
48 : fRules(rules)
49 {
50 fStatus = &status; // status is checked below
51 fParseError = &parseErr;
52 fDebugEnv = NULL;
53 #ifdef RBBI_DEBUG
54 fDebugEnv = getenv("U_RBBIDEBUG");
55 #endif
56
57
58 fForwardTree = NULL;
59 fReverseTree = NULL;
60 fSafeFwdTree = NULL;
61 fSafeRevTree = NULL;
62 fDefaultTree = &fForwardTree;
63 fForwardTables = NULL;
64 fReverseTables = NULL;
65 fSafeFwdTables = NULL;
66 fSafeRevTables = NULL;
67 fRuleStatusVals = NULL;
68 fChainRules = FALSE;
69 fLBCMNoChain = FALSE;
70 fLookAheadHardBreak = FALSE;
71 fUSetNodes = NULL;
72 fRuleStatusVals = NULL;
73 fScanner = NULL;
74 fSetBuilder = NULL;
75
76 if (U_FAILURE(status)) {
77 return;
78 }
79
80 fUSetNodes = new UVector(status); // bcos status gets overwritten here
81 fRuleStatusVals = new UVector(status);
82 fScanner = new RBBIRuleScanner(this);
83 fSetBuilder = new RBBISetBuilder(this);
84 if (U_FAILURE(status)) {
85 return;
86 }
87 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
88 status = U_MEMORY_ALLOCATION_ERROR;
89 }
90 }
91
92
93
94 //----------------------------------------------------------------------------------------
95 //
96 // Destructor
97 //
98 //----------------------------------------------------------------------------------------
99 RBBIRuleBuilder::~RBBIRuleBuilder() {
100
101 int i;
102 for (i=0; ; i++) {
103 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
104 if (n==NULL) {
105 break;
106 }
107 delete n;
108 }
109
110 delete fUSetNodes;
111 delete fSetBuilder;
112 delete fForwardTables;
113 delete fReverseTables;
114 delete fSafeFwdTables;
115 delete fSafeRevTables;
116
117 delete fForwardTree;
118 delete fReverseTree;
119 delete fSafeFwdTree;
120 delete fSafeRevTree;
121 delete fScanner;
122 delete fRuleStatusVals;
123 }
124
125
126
127
128
129 //----------------------------------------------------------------------------------------
130 //
131 // flattenData() - Collect up the compiled RBBI rule data and put it into
132 // the format for saving in ICU data files,
133 // which is also the format needed by the RBBI runtime engine.
134 //
135 //----------------------------------------------------------------------------------------
// Rounds i up to the next multiple of 8; a value that is already a multiple of 8
// is returned unchanged.
static int32_t align8(int32_t i) {
    const int32_t bumped = i + 7;   // step past the next boundary unless already on one
    return bumped & ~7;             // clear the low three bits
}
137
138 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
139 int32_t i;
140
141 if (U_FAILURE(*fStatus)) {
142 return NULL;
143 }
144
145 // Remove comments and whitespace from the rules to make it smaller.
146 UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
147
148 // Calculate the size of each section in the data.
149 // Sizes here are padded up to a multiple of 8 for better memory alignment.
150 // Sections sizes actually stored in the header are for the actual data
151 // without the padding.
152 //
153 int32_t headerSize = align8(sizeof(RBBIDataHeader));
154 int32_t forwardTableSize = align8(fForwardTables->getTableSize());
155 int32_t reverseTableSize = align8(fReverseTables->getTableSize());
156 int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
157 int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
158 int32_t trieSize = align8(fSetBuilder->getTrieSize());
159 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
160 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
161
162 int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
163 + safeFwdTableSize + safeRevTableSize
164 + statusTableSize + trieSize + rulesSize;
165
166 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
167 if (data == NULL) {
168 *fStatus = U_MEMORY_ALLOCATION_ERROR;
169 return NULL;
170 }
171 uprv_memset(data, 0, totalSize);
172
173
174 data->fMagic = 0xb1a0;
175 data->fFormatVersion[0] = 3;
176 data->fFormatVersion[1] = 1;
177 data->fFormatVersion[2] = 0;
178 data->fFormatVersion[3] = 0;
179 data->fLength = totalSize;
180 data->fCatCount = fSetBuilder->getNumCharCategories();
181
182 data->fFTable = headerSize;
183 data->fFTableLen = forwardTableSize;
184 data->fRTable = data->fFTable + forwardTableSize;
185 data->fRTableLen = reverseTableSize;
186 data->fSFTable = data->fRTable + reverseTableSize;
187 data->fSFTableLen = safeFwdTableSize;
188 data->fSRTable = data->fSFTable + safeFwdTableSize;
189 data->fSRTableLen = safeRevTableSize;
190
191 data->fTrie = data->fSRTable + safeRevTableSize;
192 data->fTrieLen = fSetBuilder->getTrieSize();
193 data->fStatusTable = data->fTrie + trieSize;
194 data->fStatusTableLen= statusTableSize;
195 data->fRuleSource = data->fStatusTable + statusTableSize;
196 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
197
198 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
199
200 fForwardTables->exportTable((uint8_t *)data + data->fFTable);
201 fReverseTables->exportTable((uint8_t *)data + data->fRTable);
202 fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
203 fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
204 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
205
206 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
207 for (i=0; i<fRuleStatusVals->size(); i++) {
208 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
209 }
210
211 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
212
213 return data;
214 }
215
216
217
218
219
220
221 //----------------------------------------------------------------------------------------
222 //
223 // createRuleBasedBreakIterator construct from source rules that are passed in
224 // in a UnicodeString
225 //
226 //----------------------------------------------------------------------------------------
227 BreakIterator *
228 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
229 UParseError &parseError,
230 UErrorCode &status)
231 {
232 // status checked below
233
234 //
235 // Read the input rules, generate a parse tree, symbol table,
236 // and list of all Unicode Sets referenced by the rules.
237 //
238 RBBIRuleBuilder builder(rules, parseError, status);
239 builder.fScanner->parse();
240 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
241 return NULL;
242 }
243
244 //
245 // UnicodeSet processing.
246 // Munge the Unicode Sets to create a set of character categories.
247 // Generate the mapping tables (TRIE) from input 32-bit characters to
248 // the character categories.
249 //
250 builder.fSetBuilder->build();
251
252
253 //
254 // Generate the DFA state transition table.
255 //
256 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
257 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
258 builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
259 builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
260 if (U_SUCCESS(status)
261 && (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
262 builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL))
263 {
264 status = U_MEMORY_ALLOCATION_ERROR;
265 return NULL;
266 }
267
268 builder.fForwardTables->build();
269 builder.fReverseTables->build();
270 builder.fSafeFwdTables->build();
271 builder.fSafeRevTables->build();
272 if (U_FAILURE(status)) {
273 return NULL;
274 }
275
276 #ifdef RBBI_DEBUG
277 if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
278 builder.fForwardTables->printRuleStatusTable();
279 }
280 #endif
281
282 //
283 // Package up the compiled data into a memory image
284 // in the run-time format.
285 //
286 RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
287
288
289 //
290 // Clean up the compiler related stuff
291 //
292
293
294 //
295 // Create a break iterator from the compiled rules.
296 // (Identical to creation from stored pre-compiled rules)
297 //
298 // status is checked after init in construction.
299 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
300 if (U_FAILURE(status)) {
301 delete This;
302 This = NULL;
303 }
304 else if(This == NULL) { // test for NULL
305 status = U_MEMORY_ALLOCATION_ERROR;
306 }
307 return This;
308 }
309
310 U_NAMESPACE_END
311
312 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */