]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/urbtok.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / urbtok.cpp
1 /*
2 *****************************************************************************************
3 * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.
4 *****************************************************************************************
5 */
6
7 #include "unicode/utypes.h"
8
9 #if !UCONFIG_NO_BREAK_ITERATION
10
11 #include "unicode/urbtok.h"
12
13 #include "rbtok.h"
14 #include "unicode/ustring.h"
15 #include "rbbidata.h"
16 #include "cmemory.h"
17 #include "ucmndata.h"
18
19 U_NAMESPACE_USE
20
21 U_CAPI UBreakIterator* U_EXPORT2
22 urbtok_openRules(const UChar *rules,
23 int32_t rulesLength,
24 UParseError *parseErr,
25 UErrorCode *status)
26 {
27 if (status == NULL || U_FAILURE(*status)){
28 return 0;
29 }
30
31 BreakIterator *result = 0;
32 UnicodeString ruleString(rules, rulesLength);
33 result = new RuleBasedTokenizer(ruleString, *parseErr, *status);
34 if(U_FAILURE(*status)) {
35 return 0;
36 }
37
38 UBreakIterator *uBI = (UBreakIterator *)result;
39 return uBI;
40 }
41
42 U_CAPI UBreakIterator* U_EXPORT2
43 urbtok_openBinaryRules(const uint8_t *rules,
44 UErrorCode *status)
45 {
46 if (status == NULL || U_FAILURE(*status)){
47 return 0;
48 }
49
50 uint32_t length = ((const RBBIDataHeader *)rules)->fLength;
51 uint8_t *ruleCopy = (uint8_t *) uprv_malloc(length);
52 if (ruleCopy == 0)
53 {
54 *status = U_MEMORY_ALLOCATION_ERROR;
55 return 0;
56 }
57 // Copy the rules so they can be adopted by the tokenizer
58 uprv_memcpy(ruleCopy, rules, length);
59 BreakIterator *result = 0;
60 result = new RuleBasedTokenizer(ruleCopy, *status);
61 if(U_FAILURE(*status)) {
62 return 0;
63 }
64
65 UBreakIterator *uBI = (UBreakIterator *)result;
66 return uBI;
67 }
68
69 U_CAPI UBreakIterator* U_EXPORT2
70 urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
71 UErrorCode *status)
72 {
73 if (status == NULL || U_FAILURE(*status)){
74 return 0;
75 }
76
77 BreakIterator *result = 0;
78 result = new RuleBasedTokenizer(rules, RuleBasedTokenizer::kDontAdopt, *status);
79 if(U_FAILURE(*status)) {
80 return 0;
81 }
82
83 UBreakIterator *uBI = (UBreakIterator *)result;
84 return uBI;
85 }
86
87 U_CAPI uint32_t U_EXPORT2
88 urbtok_getBinaryRules(UBreakIterator *bi,
89 uint8_t *buffer,
90 uint32_t buffSize,
91 UErrorCode *status)
92 {
93 if (status == NULL || U_FAILURE(*status)){
94 return 0;
95 }
96
97 uint32_t length;
98 const uint8_t *rules = ((RuleBasedBreakIterator *)bi)->getBinaryRules(length);
99 if (buffer != 0)
100 {
101 if (length > buffSize)
102 {
103 *status = U_BUFFER_OVERFLOW_ERROR;
104 }
105 else
106 {
107 uprv_memcpy(buffer, rules, length);
108 }
109 }
110 return length;
111 }
112
113 U_CAPI int32_t U_EXPORT2
114 urbtok_tokenize(UBreakIterator *bi,
115 int32_t maxTokens,
116 RuleBasedTokenRange *outTokens,
117 unsigned long *outTokenFlags)
118 {
119 return ((RuleBasedTokenizer *)bi)->tokenize(maxTokens, outTokens, outTokenFlags);
120 }
121
122 U_CAPI void U_EXPORT2
123 urbtok_swapBinaryRules(const uint8_t *rules,
124 uint8_t *buffer,
125 UBool inIsBigEndian,
126 UBool outIsBigEndian,
127 UErrorCode *status)
128 {
129 DataHeader *outH = NULL;
130 int32_t outLength = 0;
131 UDataSwapper *ds = udata_openSwapper(inIsBigEndian, U_CHARSET_FAMILY, outIsBigEndian, U_CHARSET_FAMILY, status);
132
133 if (status == NULL || U_FAILURE(*status)){
134 return;
135 }
136
137 uint32_t length = ds->readUInt32(((const RBBIDataHeader *)rules)->fLength);
138 uint32_t totalLength = sizeof(DataHeader) + length;
139
140 DataHeader *dh = (DataHeader *)uprv_malloc(totalLength);
141 if (dh == 0)
142 {
143 *status = U_MEMORY_ALLOCATION_ERROR;
144 goto closeSwapper;
145 }
146 outH = (DataHeader *)uprv_malloc(totalLength);
147 if (outH == 0)
148 {
149 *status = U_MEMORY_ALLOCATION_ERROR;
150 uprv_free(dh);
151 goto closeSwapper;
152 }
153 dh->dataHeader.headerSize = ds->readUInt16(sizeof(DataHeader));
154 dh->dataHeader.magic1 = 0xda;
155 dh->dataHeader.magic2 = 0x27;
156 dh->info.size = ds->readUInt16(sizeof(UDataInfo));
157 dh->info.reservedWord = 0;
158 dh->info.isBigEndian = inIsBigEndian;
159 dh->info.charsetFamily = U_CHARSET_FAMILY;
160 dh->info.sizeofUChar = U_SIZEOF_UCHAR;
161 dh->info.reservedByte = 0;
162 uprv_memcpy(dh->info.dataFormat, "Brk ", sizeof(dh->info.dataFormat));
163 uprv_memcpy(dh->info.formatVersion, ((const RBBIDataHeader *)rules)->fFormatVersion, sizeof(dh->info.formatVersion));
164 dh->info.dataVersion[0] = 4; // Unicode version
165 dh->info.dataVersion[1] = 1;
166 dh->info.dataVersion[2] = 0;
167 dh->info.dataVersion[3] = 0;
168 uprv_memcpy(((uint8_t*)dh) + sizeof(DataHeader), rules, length);
169
170 outLength = ubrk_swap(ds, dh, totalLength, outH, status);
171 if (U_SUCCESS(*status) && outLength != totalLength) // something went horribly wrong
172 {
173 *status = U_INVALID_FORMAT_ERROR;
174 }
175
176 if (U_SUCCESS(*status))
177 {
178 uprv_memcpy(buffer, ((uint8_t *)outH) + sizeof(DataHeader), length);
179 }
180 uprv_free(outH);
181 uprv_free(dh);
182
183 closeSwapper:
184 udata_closeSwapper(ds);
185 }
186
187
188 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */