]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/urbtok.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / urbtok.cpp
CommitLineData
73c04bcf
A
1/*
2*****************************************************************************************
46f4442e 3* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.
73c04bcf
A
4*****************************************************************************************
5*/
6
7#include "unicode/utypes.h"
8
9#if !UCONFIG_NO_BREAK_ITERATION
10
11#include "unicode/urbtok.h"
12
73c04bcf 13#include "unicode/ustring.h"
0f5d89e8
A
14#include "unicode/rbbi.h"
15#include "rbbirb.h"
73c04bcf 16#include "rbbidata.h"
0f5d89e8
A
17#include "rbbidata57.h"
18#include "rbtok.h"
73c04bcf
A
19#include "cmemory.h"
20#include "ucmndata.h"
21
22U_NAMESPACE_USE
23
0f5d89e8
A
24U_CAPI UBreakIterator* U_EXPORT2
25urbtok_open(UBreakIteratorType type,
26 const char *locale,
27 UErrorCode *status)
28{
29 UBreakIterator* result = ubrk_open(type, locale, NULL, 0, status);
30 if(U_SUCCESS(*status)) {
31 ((RuleBasedBreakIterator*)result)->initLatin1Cat();
32 }
33 return result;
34}
35
73c04bcf
A
36U_CAPI UBreakIterator* U_EXPORT2
37urbtok_openRules(const UChar *rules,
38 int32_t rulesLength,
39 UParseError *parseErr,
40 UErrorCode *status)
41{
42 if (status == NULL || U_FAILURE(*status)){
43 return 0;
44 }
73c04bcf 45 UnicodeString ruleString(rules, rulesLength);
0f5d89e8 46 BreakIterator *result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, parseErr, *status);
73c04bcf
A
47 if(U_FAILURE(*status)) {
48 return 0;
49 }
0f5d89e8
A
50 ((RuleBasedBreakIterator*)result)->initLatin1Cat();
51 return (UBreakIterator *)result;
73c04bcf
A
52}
53
54U_CAPI UBreakIterator* U_EXPORT2
55urbtok_openBinaryRules(const uint8_t *rules,
56 UErrorCode *status)
57{
58 if (status == NULL || U_FAILURE(*status)){
59 return 0;
60 }
73c04bcf
A
61 uint32_t length = ((const RBBIDataHeader *)rules)->fLength;
62 uint8_t *ruleCopy = (uint8_t *) uprv_malloc(length);
63 if (ruleCopy == 0)
64 {
65 *status = U_MEMORY_ALLOCATION_ERROR;
66 return 0;
67 }
68 // Copy the rules so they can be adopted by the tokenizer
69 uprv_memcpy(ruleCopy, rules, length);
0f5d89e8
A
70 // The following intended-to-be-private constructor does adopt the rules.
71 BreakIterator *result = new RuleBasedBreakIterator((RBBIDataHeader *)ruleCopy, *status);
73c04bcf
A
72 if(U_FAILURE(*status)) {
73 return 0;
74 }
0f5d89e8
A
75 ((RuleBasedBreakIterator*)result)->initLatin1Cat();
76 return (UBreakIterator *)result;
73c04bcf
A
77}
78
46f4442e
A
79U_CAPI UBreakIterator* U_EXPORT2
80urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
81 UErrorCode *status)
82{
83 if (status == NULL || U_FAILURE(*status)){
84 return 0;
85 }
0f5d89e8 86 uint32_t length = ((const RBBIDataHeader *)rules)->fLength;
46f4442e 87
0f5d89e8
A
88 // The following public constructor does not adopt the rules
89 BreakIterator *result = new RuleBasedBreakIterator(rules, length, *status);
46f4442e
A
90 if(U_FAILURE(*status)) {
91 return 0;
92 }
0f5d89e8
A
93 ((RuleBasedBreakIterator*)result)->initLatin1Cat();
94 return (UBreakIterator *)result;
46f4442e
A
95}
96
73c04bcf
A
97U_CAPI uint32_t U_EXPORT2
98urbtok_getBinaryRules(UBreakIterator *bi,
99 uint8_t *buffer,
100 uint32_t buffSize,
101 UErrorCode *status)
102{
103 if (status == NULL || U_FAILURE(*status)){
104 return 0;
105 }
0f5d89e8
A
106 if (buffer == NULL && buffSize > 0) {
107 *status = U_ILLEGAL_ARGUMENT_ERROR;
108 return 0;
109 }
110 RuleBasedBreakIterator *rbbi;
111 if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) {
112 *status = U_ILLEGAL_ARGUMENT_ERROR;
113 return 0;
114 }
73c04bcf 115 uint32_t length;
0f5d89e8 116 const uint8_t *rules = rbbi->getBinaryRules(length);
73c04bcf
A
117 if (buffer != 0)
118 {
0f5d89e8 119 if (length > buffSize) {
73c04bcf
A
120 *status = U_BUFFER_OVERFLOW_ERROR;
121 }
0f5d89e8 122 else {
73c04bcf
A
123 uprv_memcpy(buffer, rules, length);
124 }
125 }
126 return length;
127}
128
129U_CAPI int32_t U_EXPORT2
130urbtok_tokenize(UBreakIterator *bi,
131 int32_t maxTokens,
132 RuleBasedTokenRange *outTokens,
133 unsigned long *outTokenFlags)
134{
0f5d89e8
A
135 // Using dynamic_cast/reinterpret_cast has a significant performance impact
136 // on calls to urbtok_tokenize with maxTokens=1, use a regular cast instead
137 //RuleBasedBreakIterator *rbbi;
138 //if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) {
139 // return 0;
140 //}
141 //return rbbi->tokenize(maxTokens, outTokens, outTokenFlags);
142 if (bi == NULL || outTokens == NULL) {
143 return 0;
144 }
145 return ((RuleBasedBreakIterator*)bi)->tokenize(maxTokens, outTokens, outTokenFlags);
73c04bcf
A
146}
147
148U_CAPI void U_EXPORT2
149urbtok_swapBinaryRules(const uint8_t *rules,
150 uint8_t *buffer,
151 UBool inIsBigEndian,
152 UBool outIsBigEndian,
153 UErrorCode *status)
154{
46f4442e 155 DataHeader *outH = NULL;
3d1f044b 156 uint32_t outLength = 0;
73c04bcf
A
157 UDataSwapper *ds = udata_openSwapper(inIsBigEndian, U_CHARSET_FAMILY, outIsBigEndian, U_CHARSET_FAMILY, status);
158
159 if (status == NULL || U_FAILURE(*status)){
160 return;
161 }
162
163 uint32_t length = ds->readUInt32(((const RBBIDataHeader *)rules)->fLength);
164 uint32_t totalLength = sizeof(DataHeader) + length;
165
166 DataHeader *dh = (DataHeader *)uprv_malloc(totalLength);
167 if (dh == 0)
168 {
169 *status = U_MEMORY_ALLOCATION_ERROR;
170 goto closeSwapper;
171 }
46f4442e 172 outH = (DataHeader *)uprv_malloc(totalLength);
73c04bcf
A
173 if (outH == 0)
174 {
175 *status = U_MEMORY_ALLOCATION_ERROR;
176 uprv_free(dh);
177 goto closeSwapper;
178 }
179 dh->dataHeader.headerSize = ds->readUInt16(sizeof(DataHeader));
180 dh->dataHeader.magic1 = 0xda;
181 dh->dataHeader.magic2 = 0x27;
182 dh->info.size = ds->readUInt16(sizeof(UDataInfo));
183 dh->info.reservedWord = 0;
184 dh->info.isBigEndian = inIsBigEndian;
185 dh->info.charsetFamily = U_CHARSET_FAMILY;
186 dh->info.sizeofUChar = U_SIZEOF_UCHAR;
187 dh->info.reservedByte = 0;
188 uprv_memcpy(dh->info.dataFormat, "Brk ", sizeof(dh->info.dataFormat));
189 uprv_memcpy(dh->info.formatVersion, ((const RBBIDataHeader *)rules)->fFormatVersion, sizeof(dh->info.formatVersion));
190 dh->info.dataVersion[0] = 4; // Unicode version
191 dh->info.dataVersion[1] = 1;
192 dh->info.dataVersion[2] = 0;
193 dh->info.dataVersion[3] = 0;
194 uprv_memcpy(((uint8_t*)dh) + sizeof(DataHeader), rules, length);
195
3d1f044b 196 outLength = (uint32_t)ubrk_swap(ds, dh, totalLength, outH, status);
73c04bcf
A
197 if (U_SUCCESS(*status) && outLength != totalLength) // something went horribly wrong
198 {
199 *status = U_INVALID_FORMAT_ERROR;
200 }
201
202 if (U_SUCCESS(*status))
203 {
204 uprv_memcpy(buffer, ((uint8_t *)outH) + sizeof(DataHeader), length);
205 }
206 uprv_free(outH);
207 uprv_free(dh);
208
209closeSwapper:
210 udata_closeSwapper(ds);
211}
212
0f5d89e8
A
213U_CAPI UBreakIterator* U_EXPORT2
214urbtok57_openRules(const UChar *rules,
215 int32_t rulesLength,
216 UParseError *parseErr,
217 UErrorCode *status)
218{
219 if (status == NULL || U_FAILURE(*status)){
220 return 0;
221 }
222
223 BreakIterator *result = 0;
224 UnicodeString ruleString(rules, rulesLength);
225 result = new RuleBasedTokenizer(ruleString, *parseErr, *status);
226 if(U_FAILURE(*status)) {
227 return 0;
228 }
229
230 UBreakIterator *uBI = (UBreakIterator *)result;
231 return uBI;
232}
233
234U_CAPI UBreakIterator* U_EXPORT2
235urbtok57_openBinaryRules(const uint8_t *rules,
236 UErrorCode *status)
237{
238 if (status == NULL || U_FAILURE(*status)){
239 return 0;
240 }
241
242 uint32_t length = ((const RBBIDataHeader57 *)rules)->fLength;
243 uint8_t *ruleCopy = (uint8_t *) uprv_malloc(length);
244 if (ruleCopy == 0)
245 {
246 *status = U_MEMORY_ALLOCATION_ERROR;
247 return 0;
248 }
249 // Copy the rules so they can be adopted by the tokenizer
250 uprv_memcpy(ruleCopy, rules, length);
251 BreakIterator *result = 0;
252 result = new RuleBasedTokenizer(ruleCopy, *status);
253 if(U_FAILURE(*status)) {
254 return 0;
255 }
256
257 UBreakIterator *uBI = (UBreakIterator *)result;
258 return uBI;
259}
260
261U_CAPI UBreakIterator* U_EXPORT2
262urbtok57_openBinaryRulesNoCopy(const uint8_t *rules,
263 UErrorCode *status)
264{
265 if (status == NULL || U_FAILURE(*status)){
266 return 0;
267 }
268
269 BreakIterator *result = 0;
270 result = new RuleBasedTokenizer(rules, RuleBasedTokenizer::kDontAdopt, *status);
271 if(U_FAILURE(*status)) {
272 return 0;
273 }
274
275 UBreakIterator *uBI = (UBreakIterator *)result;
276 return uBI;
277}
278
279U_CAPI uint32_t U_EXPORT2
280urbtok57_getBinaryRules(UBreakIterator *bi,
281 uint8_t *buffer,
282 uint32_t buffSize,
283 UErrorCode *status)
284{
285 if (status == NULL || U_FAILURE(*status)){
286 return 0;
287 }
288 if (buffer == NULL && buffSize > 0) {
289 *status = U_ILLEGAL_ARGUMENT_ERROR;
290 return 0;
291 }
292 RuleBasedBreakIterator57 *rbbi57;
293 if ((rbbi57 = dynamic_cast<RuleBasedBreakIterator57*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) {
294 *status = U_ILLEGAL_ARGUMENT_ERROR;
295 return 0;
296 }
297 uint32_t length;
298 const uint8_t *rules = rbbi57->getBinaryRules(length);
299 if (buffer != 0)
300 {
301 if (length > buffSize) {
302 *status = U_BUFFER_OVERFLOW_ERROR;
303 }
304 else {
305 uprv_memcpy(buffer, rules, length);
306 }
307 }
308 return length;
309}
310
311U_CAPI int32_t U_EXPORT2
312urbtok57_tokenize(UBreakIterator *bi,
313 int32_t maxTokens,
314 RuleBasedTokenRange *outTokens,
315 unsigned long *outTokenFlags)
316{
317 return ((RuleBasedTokenizer *)bi)->tokenize(maxTokens, outTokens, outTokenFlags);
318}
73c04bcf
A
319
320#endif /* #if !UCONFIG_NO_BREAK_ITERATION */