]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | *************************************************************************** | |
b331163b | 5 | * Copyright (C) 1999-2014 International Business Machines Corporation * |
b75a7d8f A |
6 | * and others. All rights reserved. * |
7 | *************************************************************************** | |
8 | */ | |
9 | ||
10 | #include "unicode/utypes.h" | |
11 | ||
12 | #if !UCONFIG_NO_BREAK_ITERATION | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | #include "rbbidata.h" | |
16 | #include "rbbirb.h" | |
0f5d89e8 | 17 | #include "utrie2.h" |
b75a7d8f A |
18 | #include "udatamem.h" |
19 | #include "cmemory.h" | |
20 | #include "cstring.h" | |
21 | #include "umutex.h" | |
22 | ||
23 | #include "uassert.h" | |
24 | ||
25 | ||
b75a7d8f A |
26 | U_NAMESPACE_BEGIN |
27 | ||
28 | //----------------------------------------------------------------------------- | |
29 | // | |
30 | // Constructors. | |
31 | // | |
32 | //----------------------------------------------------------------------------- | |
33 | RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) { | |
b331163b | 34 | init0(); |
b75a7d8f A |
35 | init(data, status); |
36 | } | |
37 | ||
46f4442e | 38 | RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { |
b331163b | 39 | init0(); |
46f4442e A |
40 | init(data, status); |
41 | fDontFreeData = TRUE; | |
42 | } | |
43 | ||
b75a7d8f | 44 | RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { |
b331163b A |
45 | init0(); |
46 | if (U_FAILURE(status)) { | |
47 | return; | |
48 | } | |
49 | const DataHeader *dh = udm->pHeader; | |
50 | int32_t headerSize = dh->dataHeader.headerSize; | |
51 | if ( !(headerSize >= 20 && | |
52 | dh->info.isBigEndian == U_IS_BIG_ENDIAN && | |
53 | dh->info.charsetFamily == U_CHARSET_FAMILY && | |
54 | dh->info.dataFormat[0] == 0x42 && // dataFormat="Brk " | |
55 | dh->info.dataFormat[1] == 0x72 && | |
56 | dh->info.dataFormat[2] == 0x6b && | |
0f5d89e8 A |
57 | dh->info.dataFormat[3] == 0x20 && |
58 | isDataVersionAcceptable(dh->info.formatVersion)) | |
b331163b A |
59 | ) { |
60 | status = U_INVALID_FORMAT_ERROR; | |
61 | return; | |
62 | } | |
63 | const char *dataAsBytes = reinterpret_cast<const char *>(dh); | |
64 | const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize); | |
65 | init(rbbidh, status); | |
b75a7d8f A |
66 | fUDataMem = udm; |
67 | } | |
68 | ||
0f5d89e8 A |
69 | UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) { |
70 | return RBBI_DATA_FORMAT_VERSION[0] == version[0]; | |
71 | } | |
72 | ||
73 | ||
b75a7d8f A |
74 | //----------------------------------------------------------------------------- |
75 | // | |
76 | // init(). Does most of the work of construction, shared between the | |
77 | // constructors. | |
78 | // | |
79 | //----------------------------------------------------------------------------- | |
b331163b A |
80 | void RBBIDataWrapper::init0() { |
81 | fHeader = NULL; | |
82 | fForwardTable = NULL; | |
83 | fReverseTable = NULL; | |
0f5d89e8 | 84 | fRuleSource = NULL; |
b331163b | 85 | fRuleStatusTable = NULL; |
0f5d89e8 A |
86 | fTrie = NULL; |
87 | fUDataMem = NULL; | |
88 | fRefCount = 0; | |
b331163b A |
89 | fDontFreeData = TRUE; |
90 | } | |
91 | ||
b75a7d8f A |
92 | void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { |
93 | if (U_FAILURE(status)) { | |
94 | return; | |
95 | } | |
96 | fHeader = data; | |
0f5d89e8 | 97 | if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) { |
73c04bcf | 98 | status = U_INVALID_FORMAT_ERROR; |
b75a7d8f A |
99 | return; |
100 | } | |
729e4ab9 A |
101 | // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 |
102 | // that is no longer supported. At that time fFormatVersion was | |
103 | // an int32_t field, rather than an array of 4 bytes. | |
b75a7d8f | 104 | |
46f4442e | 105 | fDontFreeData = FALSE; |
374ca955 A |
106 | if (data->fFTableLen != 0) { |
107 | fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); | |
108 | } | |
b75a7d8f A |
109 | if (data->fRTableLen != 0) { |
110 | fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); | |
111 | } | |
112 | ||
0f5d89e8 A |
113 | fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
114 | (uint8_t *)data + fHeader->fTrie, | |
115 | fHeader->fTrieLen, | |
116 | NULL, // *actual length | |
117 | &status); | |
b75a7d8f A |
118 | if (U_FAILURE(status)) { |
119 | return; | |
120 | } | |
b75a7d8f A |
121 | |
122 | fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); | |
123 | fRuleString.setTo(TRUE, fRuleSource, -1); | |
374ca955 A |
124 | U_ASSERT(data->fRuleSourceLen > 0); |
125 | ||
126 | fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); | |
127 | fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); | |
b75a7d8f A |
128 | |
129 | fRefCount = 1; | |
130 | ||
131 | #ifdef RBBI_DEBUG | |
132 | char *debugEnv = getenv("U_RBBIDEBUG"); | |
133 | if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} | |
134 | #endif | |
135 | } | |
136 | ||
137 | ||
138 | //----------------------------------------------------------------------------- | |
139 | // | |
374ca955 | 140 | // Destructor. Don't call this - use removeReference() instead. |
b75a7d8f A |
141 | // |
142 | //----------------------------------------------------------------------------- | |
143 | RBBIDataWrapper::~RBBIDataWrapper() { | |
144 | U_ASSERT(fRefCount == 0); | |
0f5d89e8 A |
145 | utrie2_close(fTrie); |
146 | fTrie = NULL; | |
b75a7d8f A |
147 | if (fUDataMem) { |
148 | udata_close(fUDataMem); | |
46f4442e | 149 | } else if (!fDontFreeData) { |
b75a7d8f A |
150 | uprv_free((void *)fHeader); |
151 | } | |
152 | } | |
153 | ||
154 | ||
155 | ||
156 | //----------------------------------------------------------------------------- | |
157 | // | |
158 | // Operator == Consider two RBBIDataWrappers to be equal if they | |
159 | // refer to the same underlying data. Although | |
160 | // the data wrappers are normally shared between | |
161 | // iterator instances, it's possible to independently | |
162 | // open the same data twice, and get two instances, which | |
163 | // should still be ==. | |
164 | // | |
165 | //----------------------------------------------------------------------------- | |
166 | UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { | |
167 | if (fHeader == other.fHeader) { | |
168 | return TRUE; | |
169 | } | |
170 | if (fHeader->fLength != other.fHeader->fLength) { | |
171 | return FALSE; | |
172 | } | |
173 | if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { | |
174 | return TRUE; | |
175 | } | |
176 | return FALSE; | |
177 | } | |
178 | ||
179 | int32_t RBBIDataWrapper::hashCode() { | |
180 | return fHeader->fFTableLen; | |
181 | } | |
182 | ||
183 | ||
184 | ||
185 | //----------------------------------------------------------------------------- | |
186 | // | |
187 | // Reference Counting. A single RBBIDataWrapper object is shared among | |
188 | // however many RulesBasedBreakIterator instances are | |
189 | // referencing the same data. | |
190 | // | |
191 | //----------------------------------------------------------------------------- | |
192 | void RBBIDataWrapper::removeReference() { | |
193 | if (umtx_atomic_dec(&fRefCount) == 0) { | |
194 | delete this; | |
195 | } | |
196 | } | |
197 | ||
198 | ||
199 | RBBIDataWrapper *RBBIDataWrapper::addReference() { | |
200 | umtx_atomic_inc(&fRefCount); | |
201 | return this; | |
202 | } | |
203 | ||
204 | ||
205 | ||
206 | //----------------------------------------------------------------------------- | |
207 | // | |
208 | // getRuleSourceString | |
209 | // | |
210 | //----------------------------------------------------------------------------- | |
374ca955 | 211 | const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { |
b75a7d8f A |
212 | return fRuleString; |
213 | } | |
214 | ||
215 | ||
216 | //----------------------------------------------------------------------------- | |
217 | // | |
218 | // print - debugging function to dump the runtime data tables. | |
219 | // | |
220 | //----------------------------------------------------------------------------- | |
b75a7d8f | 221 | #ifdef RBBI_DEBUG |
374ca955 A |
222 | void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) { |
223 | uint32_t c; | |
224 | uint32_t s; | |
b75a7d8f | 225 | |
374ca955 | 226 | RBBIDebugPrintf(" %s\n", heading); |
b75a7d8f | 227 | |
374ca955 | 228 | RBBIDebugPrintf("State | Acc LA TagIx"); |
b75a7d8f | 229 | for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);} |
374ca955 A |
230 | RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) { |
231 | RBBIDebugPrintf("----"); | |
232 | } | |
b75a7d8f A |
233 | RBBIDebugPrintf("\n"); |
234 | ||
374ca955 A |
235 | if (table == NULL) { |
236 | RBBIDebugPrintf(" N U L L T A B L E\n\n"); | |
237 | return; | |
238 | } | |
239 | for (s=0; s<table->fNumStates; s++) { | |
b75a7d8f | 240 | RBBIStateTableRow *row = (RBBIStateTableRow *) |
374ca955 A |
241 | (table->fTableData + (table->fRowLen * s)); |
242 | RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx); | |
b75a7d8f A |
243 | for (c=0; c<fHeader->fCatCount; c++) { |
244 | RBBIDebugPrintf("%3d ", row->fNextState[c]); | |
245 | } | |
246 | RBBIDebugPrintf("\n"); | |
247 | } | |
374ca955 A |
248 | RBBIDebugPrintf("\n"); |
249 | } | |
250 | #endif | |
251 | ||
252 | ||
374ca955 | 253 | void RBBIDataWrapper::printData() { |
0f5d89e8 | 254 | #ifdef RBBI_DEBUG |
374ca955 | 255 | RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); |
73c04bcf A |
256 | RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], |
257 | fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); | |
374ca955 A |
258 | RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength); |
259 | RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount); | |
260 | ||
261 | printTable("Forward State Transition Table", fForwardTable); | |
262 | printTable("Reverse State Transition Table", fReverseTable); | |
b75a7d8f A |
263 | |
264 | RBBIDebugPrintf("\nOrignal Rules source:\n"); | |
374ca955 | 265 | for (int32_t c=0; fRuleSource[c] != 0; c++) { |
b75a7d8f | 266 | RBBIDebugPrintf("%c", fRuleSource[c]); |
b75a7d8f A |
267 | } |
268 | RBBIDebugPrintf("\n\n"); | |
374ca955 | 269 | #endif |
0f5d89e8 | 270 | } |
b75a7d8f A |
271 | |
272 | ||
273 | U_NAMESPACE_END | |
46f4442e | 274 | U_NAMESPACE_USE |
b75a7d8f | 275 | |
374ca955 A |
276 | //----------------------------------------------------------------------------- |
277 | // | |
278 | // ubrk_swap - byte swap and char encoding swap of RBBI data | |
279 | // | |
280 | //----------------------------------------------------------------------------- | |
281 | ||
282 | U_CAPI int32_t U_EXPORT2 | |
283 | ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, | |
284 | UErrorCode *status) { | |
285 | ||
286 | if (status == NULL || U_FAILURE(*status)) { | |
287 | return 0; | |
288 | } | |
73c04bcf A |
289 | if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
290 | *status=U_ILLEGAL_ARGUMENT_ERROR; | |
291 | return 0; | |
292 | } | |
374ca955 A |
293 | |
294 | // | |
295 | // Check that the data header is for for break data. | |
296 | // (Header contents are defined in genbrk.cpp) | |
297 | // | |
298 | const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); | |
299 | if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ | |
300 | pInfo->dataFormat[1]==0x72 && | |
301 | pInfo->dataFormat[2]==0x6b && | |
302 | pInfo->dataFormat[3]==0x20 && | |
0f5d89e8 | 303 | RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) { |
374ca955 A |
304 | udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", |
305 | pInfo->dataFormat[0], pInfo->dataFormat[1], | |
306 | pInfo->dataFormat[2], pInfo->dataFormat[3], | |
307 | pInfo->formatVersion[0]); | |
308 | *status=U_UNSUPPORTED_ERROR; | |
309 | return 0; | |
310 | } | |
311 | ||
312 | // | |
313 | // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific | |
314 | // RBBIDataHeader). This swap also conveniently gets us | |
315 | // the size of the ICU d.h., which lets us locate the start | |
316 | // of the RBBI specific data. | |
317 | // | |
318 | int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); | |
319 | ||
320 | ||
321 | // | |
322 | // Get the RRBI Data Header, and check that it appears to be OK. | |
323 | // | |
324 | const uint8_t *inBytes =(const uint8_t *)inData+headerSize; | |
325 | RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; | |
729e4ab9 | 326 | if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || |
0f5d89e8 A |
327 | !RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) || |
328 | ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) { | |
374ca955 A |
329 | udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n"); |
330 | *status=U_UNSUPPORTED_ERROR; | |
331 | return 0; | |
332 | } | |
333 | ||
334 | // | |
335 | // Prefight operation? Just return the size | |
336 | // | |
73c04bcf A |
337 | int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); |
338 | int32_t totalSize = headerSize + breakDataLength; | |
374ca955 A |
339 | if (length < 0) { |
340 | return totalSize; | |
341 | } | |
342 | ||
343 | // | |
344 | // Check that length passed in is consistent with length from RBBI data header. | |
345 | // | |
73c04bcf A |
346 | if (length < totalSize) { |
347 | udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n", | |
348 | breakDataLength); | |
349 | *status=U_INDEX_OUTOFBOUNDS_ERROR; | |
350 | return 0; | |
374ca955 | 351 | } |
374ca955 A |
352 | |
353 | ||
354 | // | |
355 | // Swap the Data. Do the data itself first, then the RBBI Data Header, because | |
356 | // we need to reference the header to locate the data, and an | |
357 | // inplace swap of the header leaves it unusable. | |
358 | // | |
73c04bcf A |
359 | uint8_t *outBytes = (uint8_t *)outData + headerSize; |
360 | RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; | |
361 | ||
374ca955 A |
362 | int32_t tableStartOffset; |
363 | int32_t tableLength; | |
364 | ||
365 | // | |
366 | // If not swapping in place, zero out the output buffer before starting. | |
367 | // Individual tables and other data items within are aligned to 8 byte boundaries | |
368 | // when originally created. Any unused space between items needs to be zero. | |
369 | // | |
370 | if (inBytes != outBytes) { | |
73c04bcf | 371 | uprv_memset(outBytes, 0, breakDataLength); |
374ca955 A |
372 | } |
373 | ||
374 | // | |
375 | // Each state table begins with several 32 bit fields. Calculate the size | |
376 | // in bytes of these. | |
377 | // | |
46f4442e | 378 | int32_t topSize = offsetof(RBBIStateTable, fTableData); |
374ca955 A |
379 | |
380 | // Forward state table. | |
381 | tableStartOffset = ds->readUInt32(rbbiDH->fFTable); | |
382 | tableLength = ds->readUInt32(rbbiDH->fFTableLen); | |
383 | ||
384 | if (tableLength > 0) { | |
385 | ds->swapArray32(ds, inBytes+tableStartOffset, topSize, | |
386 | outBytes+tableStartOffset, status); | |
387 | ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, | |
388 | outBytes+tableStartOffset+topSize, status); | |
389 | } | |
390 | ||
391 | // Reverse state table. Same layout as forward table, above. | |
392 | tableStartOffset = ds->readUInt32(rbbiDH->fRTable); | |
393 | tableLength = ds->readUInt32(rbbiDH->fRTableLen); | |
394 | ||
395 | if (tableLength > 0) { | |
396 | ds->swapArray32(ds, inBytes+tableStartOffset, topSize, | |
397 | outBytes+tableStartOffset, status); | |
398 | ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, | |
399 | outBytes+tableStartOffset+topSize, status); | |
400 | } | |
401 | ||
374ca955 | 402 | // Trie table for character categories |
0f5d89e8 A |
403 | utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), |
404 | outBytes+ds->readUInt32(rbbiDH->fTrie), status); | |
374ca955 A |
405 | |
406 | // Source Rules Text. It's UChar data | |
407 | ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), | |
408 | outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); | |
409 | ||
410 | // Table of rule status values. It's all int_32 values | |
411 | ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), | |
412 | outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); | |
413 | ||
73c04bcf | 414 | // And, last, the header. |
729e4ab9 A |
415 | // It is all int32_t values except for fFormataVersion, which is an array of four bytes. |
416 | // Swap the whole thing as int32_t, then re-swap the one field. | |
73c04bcf A |
417 | // |
418 | ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); | |
729e4ab9 | 419 | ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); |
374ca955 A |
420 | |
421 | return totalSize; | |
422 | } | |
423 | ||
424 | ||
b75a7d8f | 425 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |