]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ***************************************************************************************** | |
46f4442e | 3 | * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. |
73c04bcf A |
4 | ***************************************************************************************** |
5 | */ | |
6 | ||
7 | #include "unicode/utypes.h" | |
8 | ||
9 | #if !UCONFIG_NO_BREAK_ITERATION | |
10 | ||
11 | #include "unicode/urbtok.h" | |
12 | ||
73c04bcf | 13 | #include "unicode/ustring.h" |
0f5d89e8 A |
14 | #include "unicode/rbbi.h" |
15 | #include "rbbirb.h" | |
73c04bcf | 16 | #include "rbbidata.h" |
0f5d89e8 A |
17 | #include "rbbidata57.h" |
18 | #include "rbtok.h" | |
73c04bcf A |
19 | #include "cmemory.h" |
20 | #include "ucmndata.h" | |
21 | ||
22 | U_NAMESPACE_USE | |
23 | ||
0f5d89e8 A |
24 | U_CAPI UBreakIterator* U_EXPORT2 |
25 | urbtok_open(UBreakIteratorType type, | |
26 | const char *locale, | |
27 | UErrorCode *status) | |
28 | { | |
29 | UBreakIterator* result = ubrk_open(type, locale, NULL, 0, status); | |
30 | if(U_SUCCESS(*status)) { | |
31 | ((RuleBasedBreakIterator*)result)->initLatin1Cat(); | |
32 | } | |
33 | return result; | |
34 | } | |
35 | ||
73c04bcf A |
36 | U_CAPI UBreakIterator* U_EXPORT2 |
37 | urbtok_openRules(const UChar *rules, | |
38 | int32_t rulesLength, | |
39 | UParseError *parseErr, | |
40 | UErrorCode *status) | |
41 | { | |
42 | if (status == NULL || U_FAILURE(*status)){ | |
43 | return 0; | |
44 | } | |
73c04bcf | 45 | UnicodeString ruleString(rules, rulesLength); |
0f5d89e8 | 46 | BreakIterator *result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, parseErr, *status); |
73c04bcf A |
47 | if(U_FAILURE(*status)) { |
48 | return 0; | |
49 | } | |
0f5d89e8 A |
50 | ((RuleBasedBreakIterator*)result)->initLatin1Cat(); |
51 | return (UBreakIterator *)result; | |
73c04bcf A |
52 | } |
53 | ||
54 | U_CAPI UBreakIterator* U_EXPORT2 | |
55 | urbtok_openBinaryRules(const uint8_t *rules, | |
56 | UErrorCode *status) | |
57 | { | |
58 | if (status == NULL || U_FAILURE(*status)){ | |
59 | return 0; | |
60 | } | |
73c04bcf A |
61 | uint32_t length = ((const RBBIDataHeader *)rules)->fLength; |
62 | uint8_t *ruleCopy = (uint8_t *) uprv_malloc(length); | |
63 | if (ruleCopy == 0) | |
64 | { | |
65 | *status = U_MEMORY_ALLOCATION_ERROR; | |
66 | return 0; | |
67 | } | |
68 | // Copy the rules so they can be adopted by the tokenizer | |
69 | uprv_memcpy(ruleCopy, rules, length); | |
0f5d89e8 A |
70 | // The following intended-to-be-private constructor does adopt the rules. |
71 | BreakIterator *result = new RuleBasedBreakIterator((RBBIDataHeader *)ruleCopy, *status); | |
73c04bcf A |
72 | if(U_FAILURE(*status)) { |
73 | return 0; | |
74 | } | |
0f5d89e8 A |
75 | ((RuleBasedBreakIterator*)result)->initLatin1Cat(); |
76 | return (UBreakIterator *)result; | |
73c04bcf A |
77 | } |
78 | ||
46f4442e A |
79 | U_CAPI UBreakIterator* U_EXPORT2 |
80 | urbtok_openBinaryRulesNoCopy(const uint8_t *rules, | |
81 | UErrorCode *status) | |
82 | { | |
83 | if (status == NULL || U_FAILURE(*status)){ | |
84 | return 0; | |
85 | } | |
0f5d89e8 | 86 | uint32_t length = ((const RBBIDataHeader *)rules)->fLength; |
46f4442e | 87 | |
0f5d89e8 A |
88 | // The following public constructor does not adopt the rules |
89 | BreakIterator *result = new RuleBasedBreakIterator(rules, length, *status); | |
46f4442e A |
90 | if(U_FAILURE(*status)) { |
91 | return 0; | |
92 | } | |
0f5d89e8 A |
93 | ((RuleBasedBreakIterator*)result)->initLatin1Cat(); |
94 | return (UBreakIterator *)result; | |
46f4442e A |
95 | } |
96 | ||
73c04bcf A |
97 | U_CAPI uint32_t U_EXPORT2 |
98 | urbtok_getBinaryRules(UBreakIterator *bi, | |
99 | uint8_t *buffer, | |
100 | uint32_t buffSize, | |
101 | UErrorCode *status) | |
102 | { | |
103 | if (status == NULL || U_FAILURE(*status)){ | |
104 | return 0; | |
105 | } | |
0f5d89e8 A |
106 | if (buffer == NULL && buffSize > 0) { |
107 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
108 | return 0; | |
109 | } | |
110 | RuleBasedBreakIterator *rbbi; | |
111 | if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) { | |
112 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
113 | return 0; | |
114 | } | |
73c04bcf | 115 | uint32_t length; |
0f5d89e8 | 116 | const uint8_t *rules = rbbi->getBinaryRules(length); |
73c04bcf A |
117 | if (buffer != 0) |
118 | { | |
0f5d89e8 | 119 | if (length > buffSize) { |
73c04bcf A |
120 | *status = U_BUFFER_OVERFLOW_ERROR; |
121 | } | |
0f5d89e8 | 122 | else { |
73c04bcf A |
123 | uprv_memcpy(buffer, rules, length); |
124 | } | |
125 | } | |
126 | return length; | |
127 | } | |
128 | ||
129 | U_CAPI int32_t U_EXPORT2 | |
130 | urbtok_tokenize(UBreakIterator *bi, | |
131 | int32_t maxTokens, | |
132 | RuleBasedTokenRange *outTokens, | |
133 | unsigned long *outTokenFlags) | |
134 | { | |
0f5d89e8 A |
135 | // Using dynamic_cast/reinterpret_cast has a significant performance impact |
136 | // on calls to urbtok_tokenize with maxTokens=1, use a regular cast instead | |
137 | //RuleBasedBreakIterator *rbbi; | |
138 | //if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) { | |
139 | // return 0; | |
140 | //} | |
141 | //return rbbi->tokenize(maxTokens, outTokens, outTokenFlags); | |
142 | if (bi == NULL || outTokens == NULL) { | |
143 | return 0; | |
144 | } | |
145 | return ((RuleBasedBreakIterator*)bi)->tokenize(maxTokens, outTokens, outTokenFlags); | |
73c04bcf A |
146 | } |
147 | ||
148 | U_CAPI void U_EXPORT2 | |
149 | urbtok_swapBinaryRules(const uint8_t *rules, | |
150 | uint8_t *buffer, | |
151 | UBool inIsBigEndian, | |
152 | UBool outIsBigEndian, | |
153 | UErrorCode *status) | |
154 | { | |
46f4442e A |
155 | DataHeader *outH = NULL; |
156 | int32_t outLength = 0; | |
73c04bcf A |
157 | UDataSwapper *ds = udata_openSwapper(inIsBigEndian, U_CHARSET_FAMILY, outIsBigEndian, U_CHARSET_FAMILY, status); |
158 | ||
159 | if (status == NULL || U_FAILURE(*status)){ | |
160 | return; | |
161 | } | |
162 | ||
163 | uint32_t length = ds->readUInt32(((const RBBIDataHeader *)rules)->fLength); | |
164 | uint32_t totalLength = sizeof(DataHeader) + length; | |
165 | ||
166 | DataHeader *dh = (DataHeader *)uprv_malloc(totalLength); | |
167 | if (dh == 0) | |
168 | { | |
169 | *status = U_MEMORY_ALLOCATION_ERROR; | |
170 | goto closeSwapper; | |
171 | } | |
46f4442e | 172 | outH = (DataHeader *)uprv_malloc(totalLength); |
73c04bcf A |
173 | if (outH == 0) |
174 | { | |
175 | *status = U_MEMORY_ALLOCATION_ERROR; | |
176 | uprv_free(dh); | |
177 | goto closeSwapper; | |
178 | } | |
179 | dh->dataHeader.headerSize = ds->readUInt16(sizeof(DataHeader)); | |
180 | dh->dataHeader.magic1 = 0xda; | |
181 | dh->dataHeader.magic2 = 0x27; | |
182 | dh->info.size = ds->readUInt16(sizeof(UDataInfo)); | |
183 | dh->info.reservedWord = 0; | |
184 | dh->info.isBigEndian = inIsBigEndian; | |
185 | dh->info.charsetFamily = U_CHARSET_FAMILY; | |
186 | dh->info.sizeofUChar = U_SIZEOF_UCHAR; | |
187 | dh->info.reservedByte = 0; | |
188 | uprv_memcpy(dh->info.dataFormat, "Brk ", sizeof(dh->info.dataFormat)); | |
189 | uprv_memcpy(dh->info.formatVersion, ((const RBBIDataHeader *)rules)->fFormatVersion, sizeof(dh->info.formatVersion)); | |
190 | dh->info.dataVersion[0] = 4; // Unicode version | |
191 | dh->info.dataVersion[1] = 1; | |
192 | dh->info.dataVersion[2] = 0; | |
193 | dh->info.dataVersion[3] = 0; | |
194 | uprv_memcpy(((uint8_t*)dh) + sizeof(DataHeader), rules, length); | |
195 | ||
46f4442e | 196 | outLength = ubrk_swap(ds, dh, totalLength, outH, status); |
73c04bcf A |
197 | if (U_SUCCESS(*status) && outLength != totalLength) // something went horribly wrong |
198 | { | |
199 | *status = U_INVALID_FORMAT_ERROR; | |
200 | } | |
201 | ||
202 | if (U_SUCCESS(*status)) | |
203 | { | |
204 | uprv_memcpy(buffer, ((uint8_t *)outH) + sizeof(DataHeader), length); | |
205 | } | |
206 | uprv_free(outH); | |
207 | uprv_free(dh); | |
208 | ||
209 | closeSwapper: | |
210 | udata_closeSwapper(ds); | |
211 | } | |
212 | ||
0f5d89e8 A |
213 | U_CAPI UBreakIterator* U_EXPORT2 |
214 | urbtok57_openRules(const UChar *rules, | |
215 | int32_t rulesLength, | |
216 | UParseError *parseErr, | |
217 | UErrorCode *status) | |
218 | { | |
219 | if (status == NULL || U_FAILURE(*status)){ | |
220 | return 0; | |
221 | } | |
222 | ||
223 | BreakIterator *result = 0; | |
224 | UnicodeString ruleString(rules, rulesLength); | |
225 | result = new RuleBasedTokenizer(ruleString, *parseErr, *status); | |
226 | if(U_FAILURE(*status)) { | |
227 | return 0; | |
228 | } | |
229 | ||
230 | UBreakIterator *uBI = (UBreakIterator *)result; | |
231 | return uBI; | |
232 | } | |
233 | ||
234 | U_CAPI UBreakIterator* U_EXPORT2 | |
235 | urbtok57_openBinaryRules(const uint8_t *rules, | |
236 | UErrorCode *status) | |
237 | { | |
238 | if (status == NULL || U_FAILURE(*status)){ | |
239 | return 0; | |
240 | } | |
241 | ||
242 | uint32_t length = ((const RBBIDataHeader57 *)rules)->fLength; | |
243 | uint8_t *ruleCopy = (uint8_t *) uprv_malloc(length); | |
244 | if (ruleCopy == 0) | |
245 | { | |
246 | *status = U_MEMORY_ALLOCATION_ERROR; | |
247 | return 0; | |
248 | } | |
249 | // Copy the rules so they can be adopted by the tokenizer | |
250 | uprv_memcpy(ruleCopy, rules, length); | |
251 | BreakIterator *result = 0; | |
252 | result = new RuleBasedTokenizer(ruleCopy, *status); | |
253 | if(U_FAILURE(*status)) { | |
254 | return 0; | |
255 | } | |
256 | ||
257 | UBreakIterator *uBI = (UBreakIterator *)result; | |
258 | return uBI; | |
259 | } | |
260 | ||
261 | U_CAPI UBreakIterator* U_EXPORT2 | |
262 | urbtok57_openBinaryRulesNoCopy(const uint8_t *rules, | |
263 | UErrorCode *status) | |
264 | { | |
265 | if (status == NULL || U_FAILURE(*status)){ | |
266 | return 0; | |
267 | } | |
268 | ||
269 | BreakIterator *result = 0; | |
270 | result = new RuleBasedTokenizer(rules, RuleBasedTokenizer::kDontAdopt, *status); | |
271 | if(U_FAILURE(*status)) { | |
272 | return 0; | |
273 | } | |
274 | ||
275 | UBreakIterator *uBI = (UBreakIterator *)result; | |
276 | return uBI; | |
277 | } | |
278 | ||
279 | U_CAPI uint32_t U_EXPORT2 | |
280 | urbtok57_getBinaryRules(UBreakIterator *bi, | |
281 | uint8_t *buffer, | |
282 | uint32_t buffSize, | |
283 | UErrorCode *status) | |
284 | { | |
285 | if (status == NULL || U_FAILURE(*status)){ | |
286 | return 0; | |
287 | } | |
288 | if (buffer == NULL && buffSize > 0) { | |
289 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
290 | return 0; | |
291 | } | |
292 | RuleBasedBreakIterator57 *rbbi57; | |
293 | if ((rbbi57 = dynamic_cast<RuleBasedBreakIterator57*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) { | |
294 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
295 | return 0; | |
296 | } | |
297 | uint32_t length; | |
298 | const uint8_t *rules = rbbi57->getBinaryRules(length); | |
299 | if (buffer != 0) | |
300 | { | |
301 | if (length > buffSize) { | |
302 | *status = U_BUFFER_OVERFLOW_ERROR; | |
303 | } | |
304 | else { | |
305 | uprv_memcpy(buffer, rules, length); | |
306 | } | |
307 | } | |
308 | return length; | |
309 | } | |
310 | ||
311 | U_CAPI int32_t U_EXPORT2 | |
312 | urbtok57_tokenize(UBreakIterator *bi, | |
313 | int32_t maxTokens, | |
314 | RuleBasedTokenRange *outTokens, | |
315 | unsigned long *outTokenFlags) | |
316 | { | |
317 | return ((RuleBasedTokenizer *)bi)->tokenize(maxTokens, outTokens, outTokenFlags); | |
318 | } | |
73c04bcf A |
319 | |
320 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |