]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbirb.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / rbbirb.cpp
1 //
2 // file: rbbirb.cpp
3 //
4 // Copyright (C) 2002-2003, International Business Machines Corporation and others.
5 // All Rights Reserved.
6 //
7 // This file contains the RBBIRuleBuilder class implementation. This is the main class for
8 // building (compiling) break rules into the tables required by the runtime
9 // RBBI engine.
10 //
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/brkiter.h"
17 #include "unicode/rbbi.h"
18 #include "unicode/ubrk.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "cmemory.h"
26 #include "cstring.h"
27
28 #include "rbbirb.h"
29 #include "rbbinode.h"
30
31 #include "rbbiscan.h"
32 #include "rbbisetb.h"
33 #include "rbbitblb.h"
34 #include "rbbidata.h"
35
36
37 U_NAMESPACE_BEGIN
38
39
40 //----------------------------------------------------------------------------------------
41 //
42 // Constructor.
43 //
44 //----------------------------------------------------------------------------------------
45 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
46 UParseError &parseErr,
47 UErrorCode &status)
48 : fRules(rules)
49 {
50 fStatus = &status;
51 fParseError = &parseErr;
52 fDebugEnv = NULL;
53 #ifdef RBBI_DEBUG
54 fDebugEnv = getenv("U_RBBIDEBUG");
55 #endif
56
57
58 fForwardTree = NULL;
59 fReverseTree = NULL;
60 fForwardTables = NULL;
61 fReverseTables = NULL;
62 fUSetNodes = new UVector(status);
63 fScanner = new RBBIRuleScanner(this);
64 fSetBuilder = new RBBISetBuilder(this);
65 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0) {
66 status = U_MEMORY_ALLOCATION_ERROR;
67 }
68 }
69
70
71
72 //----------------------------------------------------------------------------------------
73 //
74 // Destructor
75 //
76 //----------------------------------------------------------------------------------------
77 RBBIRuleBuilder::~RBBIRuleBuilder() {
78
79 int i;
80 for (i=0; ; i++) {
81 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
82 if (n==NULL) {
83 break;
84 }
85 delete n;
86 }
87
88 delete fUSetNodes;
89 delete fSetBuilder;
90 delete fForwardTables;
91 delete fReverseTables;
92 delete fForwardTree;
93 delete fReverseTree;
94 delete fScanner;
95 }
96
97
98
99
100
101 //----------------------------------------------------------------------------------------
102 //
103 // flattenData() - Collect up the compiled RBBI rule data and put it into
104 // the format for saving in ICU data files,
105 // which is also the format needed by the RBBI runtime engine.
106 //
107 //----------------------------------------------------------------------------------------
// Round i up to the nearest multiple of 8; multiples of 8 are returned unchanged.
static int32_t align8(int32_t i) {
    return (i + 7) & ~7;
}
109
110 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
111 if (U_FAILURE(*fStatus)) {
112 return NULL;
113 }
114
115 // Remove comments and whitespace from the rules to make it smaller.
116 UnicodeString strippedRules(RBBIRuleScanner::stripRules(fRules));
117
118 // Calculate the size of each section in the data.
119 // Sizes here are padded up to a multiple of 8 for better memory alignment.
120 // Sections sizes actually stored in the header are for the actual data
121 // without the padding.
122 //
123 int32_t headerSize = align8(sizeof(RBBIDataHeader));
124 int32_t forwardTableSize = align8(fForwardTables->getTableSize());
125 int32_t reverseTableSize = align8(fReverseTables->getTableSize());
126 int32_t trieSize = align8(fSetBuilder->getTrieSize());
127 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
128
129 int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
130 + trieSize + rulesSize;
131 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
132 if (data == NULL) {
133 *fStatus = U_MEMORY_ALLOCATION_ERROR;
134 return NULL;
135 }
136 uprv_memset(data, 0, totalSize);
137
138
139 data->fMagic = 0xb1a0;
140 data->fVersion = 1;
141 data->fLength = totalSize;
142 data->fCatCount = fSetBuilder->getNumCharCategories();
143
144 data->fFTable = headerSize;
145 data->fFTableLen = forwardTableSize;
146 data->fRTable = data->fFTable + forwardTableSize;
147 data->fRTableLen = reverseTableSize;
148 data->fTrie = data->fRTable + reverseTableSize;
149 data->fTrieLen = fSetBuilder->getTrieSize();
150 data->fRuleSource = data->fTrie + trieSize;
151 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
152
153 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
154
155 fForwardTables->exportTable((uint8_t *)data + data->fFTable);
156 fReverseTables->exportTable((uint8_t *)data + data->fRTable);
157 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
158 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
159
160 return data;
161 }
162
163
164
165
166
167
168 //----------------------------------------------------------------------------------------
169 //
170 // createRuleBasedBreakIterator construct from source rules that are passed in
171 // in a UnicodeString
172 //
173 //----------------------------------------------------------------------------------------
174 BreakIterator *
175 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
176 UParseError &parseError,
177 UErrorCode &status)
178 {
179 if (U_FAILURE(status)) {
180 return NULL;
181 }
182
183 //
184 // Read the input rules, generate a parse tree, symbol table,
185 // and list of all Unicode Sets referenced by the rules.
186 //
187 RBBIRuleBuilder builder(rules, parseError, status);
188 builder.fScanner->parse();
189 if (U_FAILURE(status)) {
190 return NULL;
191 }
192
193 //
194 // UnicodeSet processing.
195 // Munge the Unicode Sets to create a set of character categories.
196 // Generate the mapping tables (TRIE) from input 32-bit characters to
197 // the character categories.
198 //
199 builder.fSetBuilder->build();
200
201
202 //
203 // Generate the DFA state transition table.
204 //
205 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
206 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
207 if(builder.fForwardTables == NULL || builder.fReverseTables == NULL) {
208 status = U_MEMORY_ALLOCATION_ERROR;
209 return NULL;
210 }
211
212 builder.fForwardTables->build();
213 builder.fReverseTables->build();
214 if (U_FAILURE(status)) {
215 return NULL;
216 }
217
218
219 //
220 // Package up the compiled data into a memory image
221 // in the run-time format.
222 //
223 RBBIDataHeader *data;
224 data = builder.flattenData();
225
226
227 //
228 // Clean up the compiler related stuff
229 //
230
231
232 //
233 // Create a break iterator from the compiled rules.
234 // (Identical to creation from stored pre-compiled rules)
235 //
236 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
237 /* test for NULL */
238 if(This == NULL) {
239 status = U_MEMORY_ALLOCATION_ERROR;
240 return NULL;
241 }
242
243 if (U_FAILURE(status)) {
244 delete This;
245 This = NULL;
246 }
247 return This;
248 }
249
250 U_NAMESPACE_END
251
252 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */