]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbidata.cpp
ICU-6.2.8.tar.gz
[apple/icu.git] / icuSources / common / rbbidata.cpp
1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2004 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 ***************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_BREAK_ITERATION
11
#include <stddef.h>

#include "unicode/utypes.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "utrie.h"
#include "udatamem.h"
#include "cmemory.h"
#include "cstring.h"
#include "umutex.h"

#include "uassert.h"
22
23
24 //-----------------------------------------------------------------------------------
25 //
26 // Trie access folding function. Copied as-is from properties code in uchar.c
27 //
28 //-----------------------------------------------------------------------------------
29 U_CDECL_BEGIN
30 static int32_t U_CALLCONV
31 getFoldingOffset(uint32_t data) {
32 /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
33 if(data&0x8000) {
34 return (int32_t)(data&0x7fff);
35 } else {
36 return 0;
37 }
38 }
39 U_CDECL_END
40
41 U_NAMESPACE_BEGIN
42
43 //-----------------------------------------------------------------------------
44 //
45 // Constructors.
46 //
47 //-----------------------------------------------------------------------------
48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
49 init(data, status);
50 }
51
52 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
53 const RBBIDataHeader *d = (const RBBIDataHeader *)
54 // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
55 // taking into consideration the padding added in by udata_write
56 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
57 init(d, status);
58 fUDataMem = udm;
59 }
60
61 //-----------------------------------------------------------------------------
62 //
63 // init(). Does most of the work of construction, shared between the
64 // constructors.
65 //
66 //-----------------------------------------------------------------------------
67 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
68 if (U_FAILURE(status)) {
69 return;
70 }
71 fHeader = data;
72 if (fHeader->fMagic != 0xb1a0) {
73 status = U_BRK_INTERNAL_ERROR;
74 return;
75 }
76
77 fUDataMem = NULL;
78 fReverseTable = NULL;
79 fSafeFwdTable = NULL;
80 fSafeRevTable = NULL;
81 if (data->fFTableLen != 0) {
82 fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
83 }
84 if (data->fRTableLen != 0) {
85 fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
86 }
87 if (data->fSFTableLen != 0) {
88 fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
89 }
90 if (data->fSRTableLen != 0) {
91 fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
92 }
93
94
95 utrie_unserialize(&fTrie,
96 (uint8_t *)data + fHeader->fTrie,
97 fHeader->fTrieLen,
98 &status);
99 if (U_FAILURE(status)) {
100 return;
101 }
102 fTrie.getFoldingOffset=getFoldingOffset;
103
104
105 fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
106 fRuleString.setTo(TRUE, fRuleSource, -1);
107 U_ASSERT(data->fRuleSourceLen > 0);
108
109 fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
110 fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t);
111
112 fRefCount = 1;
113
114 #ifdef RBBI_DEBUG
115 char *debugEnv = getenv("U_RBBIDEBUG");
116 if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
117 #endif
118 }
119
120
121 //-----------------------------------------------------------------------------
122 //
123 // Destructor. Don't call this - use removeReference() instead.
124 //
125 //-----------------------------------------------------------------------------
126 RBBIDataWrapper::~RBBIDataWrapper() {
127 U_ASSERT(fRefCount == 0);
128 if (fUDataMem) {
129 udata_close(fUDataMem);
130 } else {
131 uprv_free((void *)fHeader);
132 }
133 }
134
135
136
137 //-----------------------------------------------------------------------------
138 //
139 // Operator == Consider two RBBIDataWrappers to be equal if they
140 // refer to the same underlying data. Although
141 // the data wrappers are normally shared between
142 // iterator instances, it's possible to independently
143 // open the same data twice, and get two instances, which
144 // should still be ==.
145 //
146 //-----------------------------------------------------------------------------
147 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
148 if (fHeader == other.fHeader) {
149 return TRUE;
150 }
151 if (fHeader->fLength != other.fHeader->fLength) {
152 return FALSE;
153 }
154 if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
155 return TRUE;
156 }
157 return FALSE;
158 }
159
160 int32_t RBBIDataWrapper::hashCode() {
161 return fHeader->fFTableLen;
162 }
163
164
165
166 //-----------------------------------------------------------------------------
167 //
168 // Reference Counting. A single RBBIDataWrapper object is shared among
169 // however many RulesBasedBreakIterator instances are
170 // referencing the same data.
171 //
172 //-----------------------------------------------------------------------------
173 void RBBIDataWrapper::removeReference() {
174 if (umtx_atomic_dec(&fRefCount) == 0) {
175 delete this;
176 }
177 }
178
179
180 RBBIDataWrapper *RBBIDataWrapper::addReference() {
181 umtx_atomic_inc(&fRefCount);
182 return this;
183 }
184
185
186
187 //-----------------------------------------------------------------------------
188 //
189 // getRuleSourceString
190 //
191 //-----------------------------------------------------------------------------
192 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
193 return fRuleString;
194 }
195
196
197 //-----------------------------------------------------------------------------
198 //
199 // print - debugging function to dump the runtime data tables.
200 //
201 //-----------------------------------------------------------------------------
202 #ifdef RBBI_DEBUG
203 void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
204 uint32_t c;
205 uint32_t s;
206
207 RBBIDebugPrintf(" %s\n", heading);
208
209 RBBIDebugPrintf("State | Acc LA TagIx");
210 for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
211 RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
212 RBBIDebugPrintf("----");
213 }
214 RBBIDebugPrintf("\n");
215
216 if (table == NULL) {
217 RBBIDebugPrintf(" N U L L T A B L E\n\n");
218 return;
219 }
220 for (s=0; s<table->fNumStates; s++) {
221 RBBIStateTableRow *row = (RBBIStateTableRow *)
222 (table->fTableData + (table->fRowLen * s));
223 RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
224 for (c=0; c<fHeader->fCatCount; c++) {
225 RBBIDebugPrintf("%3d ", row->fNextState[c]);
226 }
227 RBBIDebugPrintf("\n");
228 }
229 RBBIDebugPrintf("\n");
230 }
231 #endif
232
233
234 #ifdef RBBI_DEBUG
235 void RBBIDataWrapper::printData() {
236 RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
237 RBBIDebugPrintf(" Version = %d\n", fHeader->fVersion);
238 RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
239 RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
240
241 printTable("Forward State Transition Table", fForwardTable);
242 printTable("Reverse State Transition Table", fReverseTable);
243 printTable("Safe Forward State Transition Table", fSafeFwdTable);
244 printTable("Safe Reverse State Transition Table", fSafeRevTable);
245
246 RBBIDebugPrintf("\nOrignal Rules source:\n");
247 for (int32_t c=0; fRuleSource[c] != 0; c++) {
248 RBBIDebugPrintf("%c", fRuleSource[c]);
249 }
250 RBBIDebugPrintf("\n\n");
251 }
252 #endif
253
254
255 U_NAMESPACE_END
256
257 //-----------------------------------------------------------------------------
258 //
259 // ubrk_swap - byte swap and char encoding swap of RBBI data
260 //
261 //-----------------------------------------------------------------------------
262
263 U_CAPI int32_t U_EXPORT2
264 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
265 UErrorCode *status) {
266
267 if (status == NULL || U_FAILURE(*status)) {
268 return 0;
269 }
270
271 //
272 // Check that the data header is for for break data.
273 // (Header contents are defined in genbrk.cpp)
274 //
275 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
276 if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */
277 pInfo->dataFormat[1]==0x72 &&
278 pInfo->dataFormat[2]==0x6b &&
279 pInfo->dataFormat[3]==0x20 &&
280 pInfo->formatVersion[0]==3 )) {
281 udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
282 pInfo->dataFormat[0], pInfo->dataFormat[1],
283 pInfo->dataFormat[2], pInfo->dataFormat[3],
284 pInfo->formatVersion[0]);
285 *status=U_UNSUPPORTED_ERROR;
286 return 0;
287 }
288
289 //
290 // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific
291 // RBBIDataHeader). This swap also conveniently gets us
292 // the size of the ICU d.h., which lets us locate the start
293 // of the RBBI specific data.
294 //
295 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
296
297
298 //
299 // Get the RRBI Data Header, and check that it appears to be OK.
300 //
301 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
302 RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
303 if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
304 ds->readUInt32(rbbiDH->fVersion) != 1 ||
305 ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader))
306 {
307 udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
308 *status=U_UNSUPPORTED_ERROR;
309 return 0;
310 }
311
312 //
313 // Prefight operation? Just return the size
314 //
315 int32_t totalSize = headerSize + ds->readUInt32(rbbiDH->fLength);
316 if (length < 0) {
317 return totalSize;
318 }
319
320 //
321 // Check that length passed in is consistent with length from RBBI data header.
322 //
323 if (length > 0) {
324 length -= headerSize;
325 if ((uint32_t)length < ds->readUInt32(rbbiDH->fLength)) {
326 udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
327 length);
328 *status=U_INDEX_OUTOFBOUNDS_ERROR;
329 return 0;
330 }
331 }
332
333
334 //
335 // Swap the Data. Do the data itself first, then the RBBI Data Header, because
336 // we need to reference the header to locate the data, and an
337 // inplace swap of the header leaves it unusable.
338 //
339 uint8_t *outBytes = (uint8_t *)outData + headerSize;
340 int32_t tableStartOffset;
341 int32_t tableLength;
342
343 //
344 // If not swapping in place, zero out the output buffer before starting.
345 // Individual tables and other data items within are aligned to 8 byte boundaries
346 // when originally created. Any unused space between items needs to be zero.
347 //
348 if (inBytes != outBytes) {
349 uprv_memset(outBytes, 0, length);
350 }
351
352 //
353 // Each state table begins with several 32 bit fields. Calculate the size
354 // in bytes of these.
355 //
356 RBBIStateTable *stp = NULL;
357 int32_t topSize = (char *)stp->fTableData - (char *)stp;
358
359 // Forward state table.
360 tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
361 tableLength = ds->readUInt32(rbbiDH->fFTableLen);
362
363 if (tableLength > 0) {
364 ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
365 outBytes+tableStartOffset, status);
366 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
367 outBytes+tableStartOffset+topSize, status);
368 }
369
370 // Reverse state table. Same layout as forward table, above.
371 tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
372 tableLength = ds->readUInt32(rbbiDH->fRTableLen);
373
374 if (tableLength > 0) {
375 ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
376 outBytes+tableStartOffset, status);
377 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
378 outBytes+tableStartOffset+topSize, status);
379 }
380
381 // Safe Forward state table. Same layout as forward table, above.
382 tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
383 tableLength = ds->readUInt32(rbbiDH->fSFTableLen);
384
385 if (tableLength > 0) {
386 ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
387 outBytes+tableStartOffset, status);
388 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
389 outBytes+tableStartOffset+topSize, status);
390 }
391
392 // Safe Reverse state table. Same layout as forward table, above.
393 tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
394 tableLength = ds->readUInt32(rbbiDH->fSRTableLen);
395
396 if (tableLength > 0) {
397 ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
398 outBytes+tableStartOffset, status);
399 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
400 outBytes+tableStartOffset+topSize, status);
401 }
402
403 // Trie table for character categories
404 utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
405 outBytes+ds->readUInt32(rbbiDH->fTrie), status);
406
407 // Source Rules Text. It's UChar data
408 ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
409 outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
410
411 // Table of rule status values. It's all int_32 values
412 ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
413 outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
414
415 // And, last, the header. All 32 bit values.
416 ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
417
418 return totalSize;
419 }
420
421
422 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */