]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unistr_cnv.cpp
ICU-6.2.8.tar.gz
[apple/icu.git] / icuSources / common / unistr_cnv.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: unistr_cnv.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:2
12 *
13 * created on: 2004aug19
14 * created by: Markus W. Scherer
15 *
16 * Character conversion functions moved here from unistr.cpp
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_CONVERSION
22
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/ucnv.h"
29 #include "putilimp.h"
30 #include "ustr_cnv.h"
31 #include "ustr_imp.h"
32
33 U_NAMESPACE_BEGIN
34
35 //========================================
36 // Constructors
37 //========================================
38
39 UnicodeString::UnicodeString(const char *codepageData,
40 const char *codepage)
41 : fLength(0),
42 fCapacity(US_STACKBUF_SIZE),
43 fArray(fStackBuffer),
44 fFlags(kShortString)
45 {
46 if(codepageData != 0) {
47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
48 }
49 }
50
51
52 UnicodeString::UnicodeString(const char *codepageData,
53 int32_t dataLength,
54 const char *codepage)
55 : fLength(0),
56 fCapacity(US_STACKBUF_SIZE),
57 fArray(fStackBuffer),
58 fFlags(kShortString)
59 {
60 if(codepageData != 0) {
61 doCodepageCreate(codepageData, dataLength, codepage);
62 }
63 }
64
65 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
66 UConverter *cnv,
67 UErrorCode &errorCode)
68 : fLength(0),
69 fCapacity(US_STACKBUF_SIZE),
70 fArray(fStackBuffer),
71 fFlags(kShortString)
72 {
73 if(U_SUCCESS(errorCode)) {
74 // check arguments
75 if(src==NULL) {
76 // treat as an empty string, do nothing more
77 } else if(srcLength<-1) {
78 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
79 } else {
80 // get input length
81 if(srcLength==-1) {
82 srcLength=(int32_t)uprv_strlen(src);
83 }
84 if(srcLength>0) {
85 if(cnv!=0) {
86 // use the provided converter
87 ucnv_resetToUnicode(cnv);
88 doCodepageCreate(src, srcLength, cnv, errorCode);
89 } else {
90 // use the default converter
91 cnv=u_getDefaultConverter(&errorCode);
92 doCodepageCreate(src, srcLength, cnv, errorCode);
93 u_releaseDefaultConverter(cnv);
94 }
95 }
96 }
97
98 if(U_FAILURE(errorCode)) {
99 setToBogus();
100 }
101 }
102 }
103
104 //========================================
105 // Codeset conversion
106 //========================================
107 int32_t
108 UnicodeString::extract(int32_t start,
109 int32_t length,
110 char *target,
111 uint32_t dstSize,
112 const char *codepage) const
113 {
114 // if the arguments are illegal, then do nothing
115 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
116 return 0;
117 }
118
119 // pin the indices to legal values
120 pinIndices(start, length);
121
122 // create the converter
123 UConverter *converter;
124 UErrorCode status = U_ZERO_ERROR;
125
126 // just write the NUL if the string length is 0
127 if(length == 0) {
128 if(dstSize >= 0x80000000) {
129 // careful: dstSize is unsigned! (0xffffffff means "unlimited")
130 // make sure that the NUL-termination works (takes int32_t)
131 dstSize=0x7fffffff;
132 }
133 return u_terminateChars(target, dstSize, 0, &status);
134 }
135
136 // if the codepage is the default, use our cache
137 // if it is an empty string, then use the "invariant character" conversion
138 if (codepage == 0) {
139 converter = u_getDefaultConverter(&status);
140 } else if (*codepage == 0) {
141 // use the "invariant characters" conversion
142 int32_t destLength;
143 // careful: dstSize is unsigned! (0xffffffff means "unlimited")
144 if(dstSize >= 0x80000000) {
145 destLength = length;
146 // make sure that the NUL-termination works (takes int32_t)
147 dstSize=0x7fffffff;
148 } else if(length <= (int32_t)dstSize) {
149 destLength = length;
150 } else {
151 destLength = (int32_t)dstSize;
152 }
153 u_UCharsToChars(getArrayStart() + start, target, destLength);
154 return u_terminateChars(target, (int32_t)dstSize, length, &status);
155 } else {
156 converter = ucnv_open(codepage, &status);
157 }
158
159 length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
160
161 // close the converter
162 if (codepage == 0) {
163 u_releaseDefaultConverter(converter);
164 } else {
165 ucnv_close(converter);
166 }
167
168 return length;
169 }
170
171 int32_t
172 UnicodeString::extract(char *dest, int32_t destCapacity,
173 UConverter *cnv,
174 UErrorCode &errorCode) const {
175 if(U_FAILURE(errorCode)) {
176 return 0;
177 }
178
179 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
180 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
181 return 0;
182 }
183
184 // nothing to do?
185 if(fLength<=0) {
186 return u_terminateChars(dest, destCapacity, 0, &errorCode);
187 }
188
189 // get the converter
190 UBool isDefaultConverter;
191 if(cnv==0) {
192 isDefaultConverter=TRUE;
193 cnv=u_getDefaultConverter(&errorCode);
194 if(U_FAILURE(errorCode)) {
195 return 0;
196 }
197 } else {
198 isDefaultConverter=FALSE;
199 ucnv_resetFromUnicode(cnv);
200 }
201
202 // convert
203 int32_t length=doExtract(0, fLength, dest, destCapacity, cnv, errorCode);
204
205 // release the converter
206 if(isDefaultConverter) {
207 u_releaseDefaultConverter(cnv);
208 }
209
210 return length;
211 }
212
213 int32_t
214 UnicodeString::doExtract(int32_t start, int32_t length,
215 char *dest, int32_t destCapacity,
216 UConverter *cnv,
217 UErrorCode &errorCode) const {
218 if(U_FAILURE(errorCode)) {
219 if(destCapacity!=0) {
220 *dest=0;
221 }
222 return 0;
223 }
224
225 const UChar *src=fArray+start, *srcLimit=src+length;
226 char *originalDest=dest;
227 const char *destLimit;
228
229 if(destCapacity==0) {
230 destLimit=dest=0;
231 } else if(destCapacity==-1) {
232 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
233 destLimit=(char*)U_MAX_PTR(dest);
234 // for NUL-termination, translate into highest int32_t
235 destCapacity=0x7fffffff;
236 } else {
237 destLimit=dest+destCapacity;
238 }
239
240 // perform the conversion
241 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
242 length=(int32_t)(dest-originalDest);
243
244 // if an overflow occurs, then get the preflighting length
245 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
246 char buffer[1024];
247
248 destLimit=buffer+sizeof(buffer);
249 do {
250 dest=buffer;
251 errorCode=U_ZERO_ERROR;
252 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
253 length+=(int32_t)(dest-buffer);
254 } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
255 }
256
257 return u_terminateChars(originalDest, destCapacity, length, &errorCode);
258 }
259
260 void
261 UnicodeString::doCodepageCreate(const char *codepageData,
262 int32_t dataLength,
263 const char *codepage)
264 {
265 // if there's nothing to convert, do nothing
266 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
267 return;
268 }
269 if(dataLength == -1) {
270 dataLength = uprv_strlen(codepageData);
271 }
272
273 UErrorCode status = U_ZERO_ERROR;
274
275 // create the converter
276 // if the codepage is the default, use our cache
277 // if it is an empty string, then use the "invariant character" conversion
278 UConverter *converter = (codepage == 0 ?
279 u_getDefaultConverter(&status) :
280 *codepage == 0 ?
281 0 :
282 ucnv_open(codepage, &status));
283
284 // if we failed, set the appropriate flags and return
285 if(U_FAILURE(status)) {
286 setToBogus();
287 return;
288 }
289
290 // perform the conversion
291 if(converter == 0) {
292 // use the "invariant characters" conversion
293 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
294 u_charsToUChars(codepageData, getArrayStart(), dataLength);
295 fLength = dataLength;
296 } else {
297 setToBogus();
298 }
299 return;
300 }
301
302 // convert using the real converter
303 doCodepageCreate(codepageData, dataLength, converter, status);
304 if(U_FAILURE(status)) {
305 setToBogus();
306 }
307
308 // close the converter
309 if(codepage == 0) {
310 u_releaseDefaultConverter(converter);
311 } else {
312 ucnv_close(converter);
313 }
314 }
315
316 void
317 UnicodeString::doCodepageCreate(const char *codepageData,
318 int32_t dataLength,
319 UConverter *converter,
320 UErrorCode &status) {
321 if(U_FAILURE(status)) {
322 return;
323 }
324
325 // set up the conversion parameters
326 const char *mySource = codepageData;
327 const char *mySourceEnd = mySource + dataLength;
328 UChar *myTarget;
329
330 // estimate the size needed:
331 // 1.25 UChar's per source byte should cover most cases
332 int32_t arraySize = dataLength + (dataLength >> 2);
333
334 // we do not care about the current contents
335 UBool doCopyArray = FALSE;
336 for(;;) {
337 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
338 setToBogus();
339 break;
340 }
341
342 // perform the conversion
343 myTarget = fArray + fLength;
344 ucnv_toUnicode(converter, &myTarget, fArray + fCapacity,
345 &mySource, mySourceEnd, 0, TRUE, &status);
346
347 // update the conversion parameters
348 fLength = (int32_t)(myTarget - fArray);
349
350 // allocate more space and copy data, if needed
351 if(status == U_BUFFER_OVERFLOW_ERROR) {
352 // reset the error code
353 status = U_ZERO_ERROR;
354
355 // keep the previous conversion results
356 doCopyArray = TRUE;
357
358 // estimate the new size needed, larger than before
359 // try 2 UChar's per remaining source byte
360 arraySize = (int32_t)(fLength + 2 * (mySourceEnd - mySource));
361 } else {
362 break;
363 }
364 }
365 }
366
367 U_NAMESPACE_END
368
369 #endif