2 *****************************************************************************************
3 * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.
4 *****************************************************************************************
7 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_BREAK_ITERATION
11 #include "unicode/urbtok.h"
13 #include "unicode/ustring.h"
14 #include "unicode/rbbi.h"
17 #include "rbbidata57.h"
// urbtok_open: opens a break iterator of the requested type through
// ubrk_open(), passing NULL/0 for the text so that no text is attached at
// open time. When ubrk_open() reports success, the result is treated as a
// RuleBasedBreakIterator and initLatin1Cat() is invoked on it.
// NOTE(review): the remaining parameters (locale, status), the closing
// braces and the return statement are not visible in this view — confirm
// what is returned on failure.
24 U_CAPI UBreakIterator
* U_EXPORT2
25 urbtok_open(UBreakIteratorType type
,
29 UBreakIterator
* result
= ubrk_open(type
, locale
, NULL
, 0, status
);
30 if(U_SUCCESS(*status
)) {
// Unchecked C-style downcast of the opaque handle; presumably valid because
// ubrk_open() produced a RuleBasedBreakIterator — TODO confirm for every type.
31 ((RuleBasedBreakIterator
*)result
)->initLatin1Cat();
// urbtok_openRules: builds a break iterator from rule source text.
// The rules/rulesLength pair is wrapped in a UnicodeString and compiled with
// RBBIRuleBuilder::createRuleBasedBreakIterator(); parse problems are
// reported through parseErr and *status. On success initLatin1Cat() is called
// and the iterator is returned as an opaque UBreakIterator*.
// NOTE(review): the bodies of the two error branches are not visible in this
// view — confirm what they return.
36 U_CAPI UBreakIterator
* U_EXPORT2
37 urbtok_openRules(const UChar
*rules
,
39 UParseError
*parseErr
,
// Nothing is attempted if the caller passed no status or an existing failure.
42 if (status
== NULL
|| U_FAILURE(*status
)){
45 UnicodeString
ruleString(rules
, rulesLength
);
46 BreakIterator
*result
= RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString
, parseErr
, *status
);
47 if(U_FAILURE(*status
)) {
// Reached only when rule compilation succeeded.
50 ((RuleBasedBreakIterator
*)result
)->initLatin1Cat();
51 return (UBreakIterator
*)result
;
// urbtok_openBinaryRules: creates a break iterator from pre-compiled binary
// rules. The rule data is duplicated into a freshly allocated buffer whose
// size is taken from the fLength field of the RBBIDataHeader at the start of
// `rules`, and that copy is handed to the RuleBasedBreakIterator constructor.
// On success initLatin1Cat() is called and the iterator is returned as an
// opaque UBreakIterator*.
// NOTE(review): fLength is read straight from the caller's data with no
// validation visible here; the error-branch bodies are also not visible.
54 U_CAPI UBreakIterator
* U_EXPORT2
55 urbtok_openBinaryRules(const uint8_t *rules
,
// Nothing is attempted if the caller passed no status or an existing failure.
58 if (status
== NULL
|| U_FAILURE(*status
)){
61 uint32_t length
= ((const RBBIDataHeader
*)rules
)->fLength
;
62 uint8_t *ruleCopy
= (uint8_t *) uprv_malloc(length
);
// Presumably guarded by a ruleCopy == NULL test on a line not shown — confirm.
65 *status
= U_MEMORY_ALLOCATION_ERROR
;
68 // Copy the rules so they can be adopted by the tokenizer
69 uprv_memcpy(ruleCopy
, rules
, length
);
70 // The following intended-to-be-private constructor does adopt the rules.
// NOTE(review): no NULL check on the result of `new` is visible before use.
71 BreakIterator
*result
= new RuleBasedBreakIterator((RBBIDataHeader
*)ruleCopy
, *status
);
72 if(U_FAILURE(*status
)) {
75 ((RuleBasedBreakIterator
*)result
)->initLatin1Cat();
76 return (UBreakIterator
*)result
;
// urbtok_openBinaryRulesNoCopy: creates a break iterator directly over the
// caller's binary rule data, without duplicating it. The length handed to the
// constructor is the fLength field of the RBBIDataHeader at the start of
// `rules`. On success initLatin1Cat() is called and the iterator is returned
// as an opaque UBreakIterator*.
// NOTE(review): because the rules are not adopted, the caller presumably has
// to keep `rules` valid for the lifetime of the iterator — confirm.
79 U_CAPI UBreakIterator
* U_EXPORT2
80 urbtok_openBinaryRulesNoCopy(const uint8_t *rules
,
// Nothing is attempted if the caller passed no status or an existing failure.
83 if (status
== NULL
|| U_FAILURE(*status
)){
86 uint32_t length
= ((const RBBIDataHeader
*)rules
)->fLength
;
88 // The following public constructor does not adopt the rules
89 BreakIterator
*result
= new RuleBasedBreakIterator(rules
, length
, *status
);
90 if(U_FAILURE(*status
)) {
93 ((RuleBasedBreakIterator
*)result
)->initLatin1Cat();
94 return (UBreakIterator
*)result
;
// urbtok_getBinaryRules: copies the binary rule data of a rule-based break
// iterator into the caller's buffer.
// Status codes set by the visible code:
//   U_ILLEGAL_ARGUMENT_ERROR - buffer is NULL while buffSize > 0, or bi is
//                              not a RuleBasedBreakIterator
//   U_BUFFER_OVERFLOW_ERROR  - the rule data is larger than buffSize
// NOTE(review): the declaration of `length` and every return statement are
// not visible in this view — confirm the value returned on each path.
97 U_CAPI
uint32_t U_EXPORT2
98 urbtok_getBinaryRules(UBreakIterator
*bi
,
// Nothing is attempted if the caller passed no status or an existing failure.
103 if (status
== NULL
|| U_FAILURE(*status
)){
// A NULL buffer is only accepted together with buffSize == 0.
106 if (buffer
== NULL
&& buffSize
> 0) {
107 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
110 RuleBasedBreakIterator
*rbbi
;
// Checked downcast: rejects handles that are not RuleBasedBreakIterators.
111 if ((rbbi
= dynamic_cast<RuleBasedBreakIterator
*>(reinterpret_cast<BreakIterator
*>(bi
))) == NULL
) {
112 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// getBinaryRules() reports the size of the rule data through `length`.
116 const uint8_t *rules
= rbbi
->getBinaryRules(length
);
119 if (length
> buffSize
) {
120 *status
= U_BUFFER_OVERFLOW_ERROR
;
// Presumably reached only when the data fits in the buffer — confirm against
// the branch structure on the lines not shown.
123 uprv_memcpy(buffer
, rules
, length
);
// urbtok_tokenize: forwards to RuleBasedBreakIterator::tokenize() with
// maxTokens, outTokens and outTokenFlags, and returns its result.
// bi and outTokens are tested for NULL first; the handle is then converted
// with a plain cast, so the caller is responsible for passing a handle that
// really is a RuleBasedBreakIterator.
// NOTE(review): outTokenFlags is not part of the NULL guard — presumably it
// is optional; confirm in tokenize(). The body of the guard is not visible.
129 U_CAPI
int32_t U_EXPORT2
130 urbtok_tokenize(UBreakIterator
*bi
,
132 RuleBasedTokenRange
*outTokens
,
133 unsigned long *outTokenFlags
)
135 // Using dynamic_cast/reinterpret_cast has a significant performance impact
136 // on calls to urbtok_tokenize with maxTokens=1, use a regular cast instead
137 //RuleBasedBreakIterator *rbbi;
138 //if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) {
141 //return rbbi->tokenize(maxTokens, outTokens, outTokenFlags);
142 if (bi
== NULL
|| outTokens
== NULL
) {
// Unchecked cast, chosen deliberately for speed (see the comment above).
145 return ((RuleBasedBreakIterator
*)bi
)->tokenize(maxTokens
, outTokens
, outTokenFlags
);
// urbtok_swapBinaryRules: converts binary break rules between byte orders.
// Steps visible in this block:
//   1. open a UDataSwapper from inIsBigEndian to outIsBigEndian (same
//      charset family on both sides);
//   2. build a temporary image `dh` consisting of a synthesized DataHeader
//      followed by a copy of the rules;
//   3. run ubrk_swap() from `dh` into a second allocation `outH`;
//   4. on success, copy the swapped rules (without the DataHeader) to buffer.
// The copy in step 4 writes `length` bytes (the fLength of the input rules);
// no size for `buffer` is checked in the visible code.
// NOTE(review): the conditions around the allocation failures and any
// uprv_free() calls are on lines not visible in this view — confirm that both
// allocations are released on every path.
148 U_CAPI
void U_EXPORT2
149 urbtok_swapBinaryRules(const uint8_t *rules
,
152 UBool outIsBigEndian
,
155 DataHeader
*outH
= NULL
;
156 uint32_t outLength
= 0;
// NOTE(review): status is handed to udata_openSwapper() before the
// status == NULL guard below — confirm the swapper tolerates a NULL status.
157 UDataSwapper
*ds
= udata_openSwapper(inIsBigEndian
, U_CHARSET_FAMILY
, outIsBigEndian
, U_CHARSET_FAMILY
, status
);
159 if (status
== NULL
|| U_FAILURE(*status
)){
// fLength is stored in the input byte order, so it is read via the swapper.
163 uint32_t length
= ds
->readUInt32(((const RBBIDataHeader
*)rules
)->fLength
);
164 uint32_t totalLength
= sizeof(DataHeader
) + length
;
166 DataHeader
*dh
= (DataHeader
*)uprv_malloc(totalLength
);
169 *status
= U_MEMORY_ALLOCATION_ERROR
;
172 outH
= (DataHeader
*)uprv_malloc(totalLength
);
175 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Synthesize the DataHeader that precedes the rules in the temporary image.
// The 16-bit sizes go through ds->readUInt16(); presumably this stores them
// in the input byte order so ubrk_swap() can read them back — TODO confirm.
179 dh
->dataHeader
.headerSize
= ds
->readUInt16(sizeof(DataHeader
));
180 dh
->dataHeader
.magic1
= 0xda;
181 dh
->dataHeader
.magic2
= 0x27;
182 dh
->info
.size
= ds
->readUInt16(sizeof(UDataInfo
));
183 dh
->info
.reservedWord
= 0;
// The header describes the input data, hence inIsBigEndian.
184 dh
->info
.isBigEndian
= inIsBigEndian
;
185 dh
->info
.charsetFamily
= U_CHARSET_FAMILY
;
186 dh
->info
.sizeofUChar
= U_SIZEOF_UCHAR
;
187 dh
->info
.reservedByte
= 0;
188 uprv_memcpy(dh
->info
.dataFormat
, "Brk ", sizeof(dh
->info
.dataFormat
));
// The format version is taken from the rules' own RBBIDataHeader.
189 uprv_memcpy(dh
->info
.formatVersion
, ((const RBBIDataHeader
*)rules
)->fFormatVersion
, sizeof(dh
->info
.formatVersion
));
// dataVersion is hard-coded to 4.1.0.0.
190 dh
->info
.dataVersion
[0] = 4; // Unicode version
191 dh
->info
.dataVersion
[1] = 1;
192 dh
->info
.dataVersion
[2] = 0;
193 dh
->info
.dataVersion
[3] = 0;
// Place the rules immediately after the synthesized header.
194 uprv_memcpy(((uint8_t*)dh
) + sizeof(DataHeader
), rules
, length
);
196 outLength
= (uint32_t)ubrk_swap(ds
, dh
, totalLength
, outH
, status
);
// A swapped size that differs from the input size is reported as a format error.
197 if (U_SUCCESS(*status
) && outLength
!= totalLength
) // something went horribly wrong
199 *status
= U_INVALID_FORMAT_ERROR
;
202 if (U_SUCCESS(*status
))
// Only the rule payload is handed back; the DataHeader is skipped.
204 uprv_memcpy(buffer
, ((uint8_t *)outH
) + sizeof(DataHeader
), length
);
210 udata_closeSwapper(ds
);
// urbtok57_openRules: builds a RuleBasedTokenizer from rule source text.
// The rules/rulesLength pair is wrapped in a UnicodeString and passed to the
// RuleBasedTokenizer constructor together with *parseErr and *status; the new
// object is then exposed as an opaque UBreakIterator*.
// NOTE(review): the body of the failure branch and the return statement are
// not visible in this view — confirm whether `result` is deleted on failure.
213 U_CAPI UBreakIterator
* U_EXPORT2
214 urbtok57_openRules(const UChar
*rules
,
216 UParseError
*parseErr
,
// Nothing is attempted if the caller passed no status or an existing failure.
219 if (status
== NULL
|| U_FAILURE(*status
)){
223 BreakIterator
*result
= 0;
224 UnicodeString
ruleString(rules
, rulesLength
);
// NOTE(review): parseErr is dereferenced here and no NULL check for it is
// visible — confirm that callers never pass NULL.
225 result
= new RuleBasedTokenizer(ruleString
, *parseErr
, *status
);
226 if(U_FAILURE(*status
)) {
230 UBreakIterator
*uBI
= (UBreakIterator
*)result
;
// urbtok57_openBinaryRules: creates a RuleBasedTokenizer from pre-compiled
// binary rules. The rule data is duplicated into a freshly allocated buffer
// whose size is the fLength field of the RBBIDataHeader57 at the start of
// `rules`; that copy is passed to the RuleBasedTokenizer constructor and the
// new object is exposed as an opaque UBreakIterator*.
// NOTE(review): fLength is read straight from the caller's data with no
// validation visible here; the error-branch bodies and the return statement
// are also not visible in this view.
234 U_CAPI UBreakIterator
* U_EXPORT2
235 urbtok57_openBinaryRules(const uint8_t *rules
,
// Nothing is attempted if the caller passed no status or an existing failure.
238 if (status
== NULL
|| U_FAILURE(*status
)){
242 uint32_t length
= ((const RBBIDataHeader57
*)rules
)->fLength
;
243 uint8_t *ruleCopy
= (uint8_t *) uprv_malloc(length
);
// Presumably guarded by a ruleCopy == NULL test on a line not shown — confirm.
246 *status
= U_MEMORY_ALLOCATION_ERROR
;
249 // Copy the rules so they can be adopted by the tokenizer
250 uprv_memcpy(ruleCopy
, rules
, length
);
251 BreakIterator
*result
= 0;
252 result
= new RuleBasedTokenizer(ruleCopy
, *status
);
253 if(U_FAILURE(*status
)) {
257 UBreakIterator
*uBI
= (UBreakIterator
*)result
;
// urbtok57_openBinaryRulesNoCopy: creates a RuleBasedTokenizer directly over
// the caller's binary rule data. The constructor is called with
// RuleBasedTokenizer::kDontAdopt, and the new object is exposed as an opaque
// UBreakIterator*.
// NOTE(review): with kDontAdopt the caller presumably has to keep `rules`
// valid for the lifetime of the tokenizer — confirm. The failure-branch body
// and the return statement are not visible in this view.
261 U_CAPI UBreakIterator
* U_EXPORT2
262 urbtok57_openBinaryRulesNoCopy(const uint8_t *rules
,
// Nothing is attempted if the caller passed no status or an existing failure.
265 if (status
== NULL
|| U_FAILURE(*status
)){
269 BreakIterator
*result
= 0;
270 result
= new RuleBasedTokenizer(rules
, RuleBasedTokenizer::kDontAdopt
, *status
);
271 if(U_FAILURE(*status
)) {
275 UBreakIterator
*uBI
= (UBreakIterator
*)result
;
// urbtok57_getBinaryRules: copies the binary rule data of a
// RuleBasedBreakIterator57 into the caller's buffer.
// Status codes set by the visible code:
//   U_ILLEGAL_ARGUMENT_ERROR - buffer is NULL while buffSize > 0, or bi is
//                              not a RuleBasedBreakIterator57
//   U_BUFFER_OVERFLOW_ERROR  - the rule data is larger than buffSize
// NOTE(review): the declaration of `length` and every return statement are
// not visible in this view — confirm the value returned on each path.
279 U_CAPI
uint32_t U_EXPORT2
280 urbtok57_getBinaryRules(UBreakIterator
*bi
,
// Nothing is attempted if the caller passed no status or an existing failure.
285 if (status
== NULL
|| U_FAILURE(*status
)){
// A NULL buffer is only accepted together with buffSize == 0.
288 if (buffer
== NULL
&& buffSize
> 0) {
289 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
292 RuleBasedBreakIterator57
*rbbi57
;
// Checked downcast: rejects handles that are not RuleBasedBreakIterator57s.
293 if ((rbbi57
= dynamic_cast<RuleBasedBreakIterator57
*>(reinterpret_cast<BreakIterator
*>(bi
))) == NULL
) {
294 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// getBinaryRules() reports the size of the rule data through `length`.
298 const uint8_t *rules
= rbbi57
->getBinaryRules(length
);
301 if (length
> buffSize
) {
302 *status
= U_BUFFER_OVERFLOW_ERROR
;
// Presumably reached only when the data fits in the buffer — confirm against
// the branch structure on the lines not shown.
305 uprv_memcpy(buffer
, rules
, length
);
// urbtok57_tokenize: forwards to RuleBasedTokenizer::tokenize() with
// maxTokens, outTokens and outTokenFlags, and returns its result.
// NOTE(review): the handle is converted with an unchecked cast and no NULL
// test on bi or outTokens is visible before the call — confirm that callers
// always pass a valid RuleBasedTokenizer handle and output array.
311 U_CAPI
int32_t U_EXPORT2
312 urbtok57_tokenize(UBreakIterator
*bi
,
314 RuleBasedTokenRange
*outTokens
,
315 unsigned long *outTokenFlags
)
317 return ((RuleBasedTokenizer
*)bi
)->tokenize(maxTokens
, outTokens
, outTokenFlags
);
320 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */