2 ***************************************************************************************** 
   3 * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. 
   4 ***************************************************************************************** 
   7 #include "unicode/utypes.h" 
   9 #if !UCONFIG_NO_BREAK_ITERATION 
  11 #include "unicode/urbtok.h" 
  13 #include "unicode/ustring.h" 
  14 #include "unicode/rbbi.h" 
  17 #include "rbbidata57.h" 
  // urbtok_open: opens a break iterator by delegating to
  // ubrk_open(type, locale, NULL, 0, status) and, when *status reports
  // success, calls initLatin1Cat() on the result through a cast to
  // RuleBasedBreakIterator*.
  // NOTE(review): this listing has gaps (the remaining parameter declarations
  // and the end of the function are not visible); presumably the function
  // returns `result` -- confirm against the full file.
  24 U_CAPI UBreakIterator
* U_EXPORT2
 
  25 urbtok_open(UBreakIteratorType type
, 
  29     UBreakIterator
* result 
= ubrk_open(type
, locale
, NULL
, 0, status
); 
  // Only touch the iterator when ubrk_open did not report a failure.
  30     if(U_SUCCESS(*status
)) { 
  31         ((RuleBasedBreakIterator
*)result
)->initLatin1Cat(); 
  // urbtok_openRules: builds a break iterator from rule source text.
  // Visible steps: reject a NULL status or an incoming failure code, wrap
  // (rules, rulesLength) in a UnicodeString, hand it to
  // RBBIRuleBuilder::createRuleBasedBreakIterator together with parseErr and
  // *status, then call initLatin1Cat() and return the iterator cast to
  // UBreakIterator*.
  // NOTE(review): the bodies of the two error branches are not visible in
  // this listing; presumably they return a null handle -- confirm.
  36 U_CAPI UBreakIterator
* U_EXPORT2
 
  37 urbtok_openRules(const UChar     
*rules
, 
  39                UParseError     
*parseErr
, 
  // Standard entry guard: nothing is done if the caller already has an error.
  42     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
  45     UnicodeString 
ruleString(rules
, rulesLength
); 
  46     BreakIterator 
*result 
= RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString
, parseErr
, *status
); 
  // Rule compilation reports problems through *status (and parseErr).
  47     if(U_FAILURE(*status
)) { 
  50     ((RuleBasedBreakIterator
*)result
)->initLatin1Cat(); 
  51     return (UBreakIterator 
*)result
; 
  // urbtok_openBinaryRules: creates a break iterator from pre-compiled binary
  // rules, working on a private copy of the rule data.
  // Visible steps: guard on status, read fLength from `rules` viewed as an
  // RBBIDataHeader, uprv_malloc() a buffer of that size, copy the rules into
  // it, construct a RuleBasedBreakIterator from the copy, call
  // initLatin1Cat() and return the iterator cast to UBreakIterator*.
  // NOTE(review): no NULL check on `rules` is visible before it is
  // dereferenced for fLength, and the error-branch bodies (including whether
  // ruleCopy is released on constructor failure) are not visible in this
  // listing -- confirm against the full file.
  54 U_CAPI UBreakIterator
* U_EXPORT2
 
  55 urbtok_openBinaryRules(const uint8_t *rules
, 
  58     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
  // The size of the binary rule image is taken from its own header.
  61     uint32_t length 
= ((const RBBIDataHeader 
*)rules
)->fLength
; 
  62     uint8_t *ruleCopy 
= (uint8_t *) uprv_malloc(length
); 
  // Presumably executed only when the allocation above failed (the
  // enclosing condition is not visible here).
  65         *status 
= U_MEMORY_ALLOCATION_ERROR
; 
  68     // Copy the rules so they can be adopted by the tokenizer 
  69     uprv_memcpy(ruleCopy
, rules
, length
); 
  70     // The following intended-to-be-private constructor does adopt the rules. 
  71     BreakIterator 
*result 
= new RuleBasedBreakIterator((RBBIDataHeader 
*)ruleCopy
, *status
); 
  72     if(U_FAILURE(*status
)) { 
  75     ((RuleBasedBreakIterator
*)result
)->initLatin1Cat(); 
  76     return (UBreakIterator 
*)result
; 
  // urbtok_openBinaryRulesNoCopy: creates a break iterator directly over the
  // caller's binary rule data, without copying it.
  // Visible steps: guard on status, read fLength from `rules` viewed as an
  // RBBIDataHeader, construct a RuleBasedBreakIterator from (rules, length),
  // call initLatin1Cat() and return the iterator cast to UBreakIterator*.
  // Per the existing comment below, the constructor used here does not adopt
  // the rules, so the caller's buffer presumably has to outlive the iterator
  // -- confirm with the public header documentation.
  // NOTE(review): error-branch bodies are not visible in this listing.
  79 U_CAPI UBreakIterator
* U_EXPORT2
 
  80 urbtok_openBinaryRulesNoCopy(const uint8_t *rules
, 
  83     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
  // The size of the binary rule image is taken from its own header.
  86     uint32_t length 
= ((const RBBIDataHeader 
*)rules
)->fLength
; 
  88     // The following public constructor does not adopt the rules 
  89     BreakIterator 
*result 
= new RuleBasedBreakIterator(rules
, length
, *status
); 
  90     if(U_FAILURE(*status
)) { 
  93     ((RuleBasedBreakIterator
*)result
)->initLatin1Cat(); 
  94     return (UBreakIterator 
*)result
; 
  // urbtok_getBinaryRules: copies the binary rule image of a break iterator
  // into a caller-supplied buffer.
  // Visible checks, in order:
  //   - status is NULL or already a failure;
  //   - buffer is NULL while buffSize > 0   -> U_ILLEGAL_ARGUMENT_ERROR;
  //   - bi is not a RuleBasedBreakIterator  -> U_ILLEGAL_ARGUMENT_ERROR;
  //   - rule length exceeds buffSize        -> U_BUFFER_OVERFLOW_ERROR.
  // Otherwise the rules are copied into buffer with uprv_memcpy.
  // NOTE(review): the return statements and the declaration of `length` are
  // not visible in this listing; presumably the rule length is returned.
  97 U_CAPI 
uint32_t U_EXPORT2
 
  98 urbtok_getBinaryRules(UBreakIterator      
*bi
, 
 103     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
 // A NULL buffer is only rejected when the caller claims it has capacity.
 106     if (buffer 
== NULL 
&& buffSize 
> 0) { 
 107         *status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 110     RuleBasedBreakIterator 
*rbbi
; 
 // Checked downcast: handles that are not rule-based iterators are refused.
 111     if ((rbbi 
= dynamic_cast<RuleBasedBreakIterator
*>(reinterpret_cast<BreakIterator
*>(bi
))) == NULL
) { 
 112         *status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 // getBinaryRules() reports the size of the image through `length`.
 116     const uint8_t *rules 
= rbbi
->getBinaryRules(length
); 
 119         if (length 
> buffSize
) { 
 120             *status 
= U_BUFFER_OVERFLOW_ERROR
; 
 123             uprv_memcpy(buffer
, rules
, length
); 
 // urbtok_tokenize: forwards to RuleBasedBreakIterator::tokenize(maxTokens,
 // outTokens, outTokenFlags) on the iterator behind `bi`.
 // The handle is converted with a plain C-style cast; the retained comments
 // below record that a dynamic_cast was dropped for performance, so the
 // caller is trusted to pass a rule-based iterator.
 // A NULL `bi` or NULL `outTokens` is rejected before the call; `outTokenFlags`
 // is not part of that check.
 // NOTE(review): the value returned for rejected arguments is not visible in
 // this listing.
 129 U_CAPI 
int32_t U_EXPORT2
 
 130 urbtok_tokenize(UBreakIterator      
*bi
, 
 132                RuleBasedTokenRange  
*outTokens
, 
 133                unsigned long        *outTokenFlags
) 
 135     // Using dynamic_cast/reinterpret_cast has a significant performance impact 
 136     // on calls to urbtok_tokenize with maxTokens=1, use a regular cast instead 
 137     //RuleBasedBreakIterator *rbbi; 
 138     //if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) { 
 141     //return rbbi->tokenize(maxTokens, outTokens, outTokenFlags); 
 142     if (bi 
== NULL 
|| outTokens 
== NULL
) { 
 145     return ((RuleBasedBreakIterator
*)bi
)->tokenize(maxTokens
, outTokens
, outTokenFlags
); 
 // urbtok_swapBinaryRules: converts a binary rule image between byte orders
 // and writes the converted rule payload to `buffer`.
 // Visible steps:
 //   1. open a UDataSwapper for (inIsBigEndian -> outIsBigEndian), same
 //      charset family on both sides;
 //   2. read the rule length from the RBBIDataHeader via the swapper;
 //   3. allocate two blocks of sizeof(DataHeader) + length bytes (dh, outH);
 //   4. fill dh with a synthetic data header ("Brk " format, format version
 //      taken from the rules, data version 4.1.0.0) followed by the rules;
 //   5. run ubrk_swap(ds, dh, totalLength, outH, status) and treat a size
 //      mismatch as U_INVALID_FORMAT_ERROR;
 //   6. on success copy the bytes after the header in outH to `buffer`;
 //   7. close the swapper.
 // NOTE(review): udata_openSwapper is called with `status` before the
 // visible NULL check on `status`; confirm that udata_openSwapper tolerates a
 // NULL error-code pointer. The cleanup of dh/outH and the bodies of the
 // error branches are not visible in this listing.
 148 U_CAPI 
void U_EXPORT2
 
 149 urbtok_swapBinaryRules(const uint8_t *rules
, 
 152                UBool            outIsBigEndian
, 
 155     DataHeader 
*outH 
= NULL
; 
 156     uint32_t outLength 
= 0; 
 157     UDataSwapper 
*ds 
= udata_openSwapper(inIsBigEndian
, U_CHARSET_FAMILY
, outIsBigEndian
, U_CHARSET_FAMILY
, status
); 
 159     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
 // fLength is stored in the input byte order, so it is read through the swapper.
 163     uint32_t length 
= ds
->readUInt32(((const RBBIDataHeader 
*)rules
)->fLength
); 
 164     uint32_t totalLength 
= sizeof(DataHeader
) + length
; 
 166     DataHeader 
*dh 
= (DataHeader 
*)uprv_malloc(totalLength
); 
 169         *status 
= U_MEMORY_ALLOCATION_ERROR
; 
 172     outH 
= (DataHeader 
*)uprv_malloc(totalLength
); 
 175         *status 
= U_MEMORY_ALLOCATION_ERROR
; 
 // Build the data header in front of the rules; the 16-bit size fields are
 // passed through the swapper's readUInt16.
 179     dh
->dataHeader
.headerSize 
= ds
->readUInt16(sizeof(DataHeader
)); 
 180     dh
->dataHeader
.magic1 
= 0xda; 
 181     dh
->dataHeader
.magic2 
= 0x27; 
 182     dh
->info
.size 
= ds
->readUInt16(sizeof(UDataInfo
)); 
 183     dh
->info
.reservedWord 
= 0; 
 184     dh
->info
.isBigEndian 
= inIsBigEndian
; 
 185     dh
->info
.charsetFamily 
= U_CHARSET_FAMILY
; 
 186     dh
->info
.sizeofUChar 
= U_SIZEOF_UCHAR
; 
 187     dh
->info
.reservedByte 
= 0; 
 188     uprv_memcpy(dh
->info
.dataFormat
, "Brk ", sizeof(dh
->info
.dataFormat
)); 
 189     uprv_memcpy(dh
->info
.formatVersion
, ((const RBBIDataHeader 
*)rules
)->fFormatVersion
, sizeof(dh
->info
.formatVersion
)); 
 190     dh
->info
.dataVersion
[0] = 4;        // Unicode version 
 191     dh
->info
.dataVersion
[1] = 1; 
 192     dh
->info
.dataVersion
[2] = 0; 
 193     dh
->info
.dataVersion
[3] = 0; 
 // The rule image itself follows immediately after the synthetic header.
 194     uprv_memcpy(((uint8_t*)dh
) + sizeof(DataHeader
), rules
, length
); 
 196     outLength 
= (uint32_t)ubrk_swap(ds
, dh
, totalLength
, outH
, status
); 
 197     if (U_SUCCESS(*status
) && outLength 
!= totalLength
)   // something went horribly wrong 
 199         *status 
= U_INVALID_FORMAT_ERROR
; 
 // Only the swapped rules (without the header) are handed back to the caller.
 202     if (U_SUCCESS(*status
)) 
 204         uprv_memcpy(buffer
, ((uint8_t *)outH
) + sizeof(DataHeader
), length
); 
 210     udata_closeSwapper(ds
); 
 // urbtok57_openRules: builds a RuleBasedTokenizer from rule source text.
 // Visible steps: guard on status, wrap (rules, rulesLength) in a
 // UnicodeString, construct RuleBasedTokenizer(ruleString, *parseErr,
 // *status), and convert the result to a UBreakIterator* (uBI).
 // NOTE(review): parseErr is dereferenced when passed to the constructor and
 // no NULL check on it is visible here -- confirm callers always supply one.
 // The error-branch bodies and the final return are not visible in this
 // listing.
 213 U_CAPI UBreakIterator
* U_EXPORT2
 
 214 urbtok57_openRules(const UChar     
*rules
, 
 216                UParseError     
*parseErr
, 
 219     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
 223     BreakIterator 
*result 
= 0; 
 224     UnicodeString 
ruleString(rules
, rulesLength
); 
 225     result 
= new RuleBasedTokenizer(ruleString
, *parseErr
, *status
); 
 226     if(U_FAILURE(*status
)) { 
 230     UBreakIterator 
*uBI 
= (UBreakIterator 
*)result
; 
 // urbtok57_openBinaryRules: creates a RuleBasedTokenizer from pre-compiled
 // binary rules, working on a private copy of the rule data.
 // Visible steps: guard on status, read fLength from `rules` viewed as an
 // RBBIDataHeader57, uprv_malloc() a buffer of that size, copy the rules
 // into it, construct RuleBasedTokenizer(ruleCopy, *status), and convert the
 // result to a UBreakIterator* (uBI).
 // NOTE(review): no NULL check on `rules` is visible before it is
 // dereferenced; the error-branch bodies and the final return are not
 // visible in this listing.
 234 U_CAPI UBreakIterator
* U_EXPORT2
 
 235 urbtok57_openBinaryRules(const uint8_t *rules
, 
 238     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
 // The size of the binary rule image is taken from its own header.
 242     uint32_t length 
= ((const RBBIDataHeader57 
*)rules
)->fLength
; 
 243     uint8_t *ruleCopy 
= (uint8_t *) uprv_malloc(length
); 
 // Presumably executed only when the allocation above failed (the
 // enclosing condition is not visible here).
 246         *status 
= U_MEMORY_ALLOCATION_ERROR
; 
 249     // Copy the rules so they can be adopted by the tokenizer 
 250     uprv_memcpy(ruleCopy
, rules
, length
); 
 251     BreakIterator 
*result 
= 0; 
 252     result 
= new RuleBasedTokenizer(ruleCopy
, *status
); 
 253     if(U_FAILURE(*status
)) { 
 257     UBreakIterator 
*uBI 
= (UBreakIterator 
*)result
; 
 // urbtok57_openBinaryRulesNoCopy: creates a RuleBasedTokenizer directly over
 // the caller's binary rule data.
 // Visible steps: guard on status, construct
 // RuleBasedTokenizer(rules, RuleBasedTokenizer::kDontAdopt, *status), and
 // convert the result to a UBreakIterator* (uBI).
 // The kDontAdopt flag suggests the tokenizer does not take ownership of
 // `rules`, so the buffer presumably must outlive the iterator -- confirm.
 // NOTE(review): the error-branch body and the final return are not visible
 // in this listing.
 261 U_CAPI UBreakIterator
* U_EXPORT2
 
 262 urbtok57_openBinaryRulesNoCopy(const uint8_t *rules
, 
 265     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
 269     BreakIterator 
*result 
= 0; 
 270     result 
= new RuleBasedTokenizer(rules
, RuleBasedTokenizer::kDontAdopt
, *status
); 
 271     if(U_FAILURE(*status
)) { 
 275     UBreakIterator 
*uBI 
= (UBreakIterator 
*)result
; 
 // urbtok57_getBinaryRules: copies the binary rule image of a
 // RuleBasedBreakIterator57-based iterator into a caller-supplied buffer.
 // Visible checks, in order:
 //   - status is NULL or already a failure;
 //   - buffer is NULL while buffSize > 0     -> U_ILLEGAL_ARGUMENT_ERROR;
 //   - bi is not a RuleBasedBreakIterator57  -> U_ILLEGAL_ARGUMENT_ERROR;
 //   - rule length exceeds buffSize          -> U_BUFFER_OVERFLOW_ERROR.
 // Otherwise the rules are copied into buffer with uprv_memcpy.
 // NOTE(review): the return statements and the declaration of `length` are
 // not visible in this listing; presumably the rule length is returned.
 279 U_CAPI 
uint32_t U_EXPORT2
 
 280 urbtok57_getBinaryRules(UBreakIterator      
*bi
, 
 285     if (status 
== NULL 
|| U_FAILURE(*status
)){ 
 // A NULL buffer is only rejected when the caller claims it has capacity.
 288     if (buffer 
== NULL 
&& buffSize 
> 0) { 
 289         *status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 292     RuleBasedBreakIterator57 
*rbbi57
; 
 // Checked downcast: handles of any other iterator type are refused.
 293     if ((rbbi57 
= dynamic_cast<RuleBasedBreakIterator57
*>(reinterpret_cast<BreakIterator
*>(bi
))) == NULL
) { 
 294         *status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 // getBinaryRules() reports the size of the image through `length`.
 298     const uint8_t *rules 
= rbbi57
->getBinaryRules(length
); 
 301         if (length 
> buffSize
) { 
 302             *status 
= U_BUFFER_OVERFLOW_ERROR
; 
 305             uprv_memcpy(buffer
, rules
, length
); 
 // urbtok57_tokenize: forwards to RuleBasedTokenizer::tokenize(maxTokens,
 // outTokens, outTokenFlags) on the object behind `bi`, using a plain
 // C-style cast (no runtime type check).
 // NOTE(review): no NULL check on `bi` or `outTokens` is visible here,
 // whereas urbtok_tokenize in this file rejects NULL for both -- confirm
 // whether the difference is intentional.
 311 U_CAPI 
int32_t U_EXPORT2
 
 312 urbtok57_tokenize(UBreakIterator      
*bi
, 
 314                RuleBasedTokenRange  
*outTokens
, 
 315                unsigned long        *outTokenFlags
) 
 317     return ((RuleBasedTokenizer 
*)bi
)->tokenize(maxTokens
, outTokens
, outTokenFlags
); 
 320 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */