]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unistr_cnv.cpp
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / common / unistr_cnv.cpp
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
b331163b 4* Copyright (C) 1999-2014, International Business Machines
374ca955
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: unistr_cnv.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:2
12*
13* created on: 2004aug19
14* created by: Markus W. Scherer
15*
16* Character conversion functions moved here from unistr.cpp
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_CONVERSION
22
23#include "unicode/putil.h"
24#include "cstring.h"
25#include "cmemory.h"
26#include "unicode/ustring.h"
27#include "unicode/unistr.h"
28#include "unicode/ucnv.h"
729e4ab9 29#include "ucnv_imp.h"
374ca955
A
30#include "putilimp.h"
31#include "ustr_cnv.h"
32#include "ustr_imp.h"
33
34U_NAMESPACE_BEGIN
35
36//========================================
37// Constructors
38//========================================
39
729e4ab9
A
40#if !U_CHARSET_IS_UTF8
41
b331163b
A
42UnicodeString::UnicodeString(const char *codepageData) {
43 fUnion.fFields.fLengthAndFlags = kShortString;
729e4ab9
A
44 if(codepageData != 0) {
45 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
46 }
47}
48
49UnicodeString::UnicodeString(const char *codepageData,
b331163b
A
50 int32_t dataLength) {
51 fUnion.fFields.fLengthAndFlags = kShortString;
729e4ab9
A
52 if(codepageData != 0) {
53 doCodepageCreate(codepageData, dataLength, 0);
54 }
55}
56
57// else see unistr.cpp
58#endif
59
374ca955 60UnicodeString::UnicodeString(const char *codepageData,
b331163b
A
61 const char *codepage) {
62 fUnion.fFields.fLengthAndFlags = kShortString;
73c04bcf
A
63 if(codepageData != 0) {
64 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
65 }
374ca955
A
66}
67
374ca955
A
68UnicodeString::UnicodeString(const char *codepageData,
69 int32_t dataLength,
b331163b
A
70 const char *codepage) {
71 fUnion.fFields.fLengthAndFlags = kShortString;
73c04bcf
A
72 if(codepageData != 0) {
73 doCodepageCreate(codepageData, dataLength, codepage);
74 }
374ca955
A
75}
76
77UnicodeString::UnicodeString(const char *src, int32_t srcLength,
78 UConverter *cnv,
b331163b
A
79 UErrorCode &errorCode) {
80 fUnion.fFields.fLengthAndFlags = kShortString;
73c04bcf
A
81 if(U_SUCCESS(errorCode)) {
82 // check arguments
83 if(src==NULL) {
84 // treat as an empty string, do nothing more
85 } else if(srcLength<-1) {
86 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
374ca955 87 } else {
73c04bcf
A
88 // get input length
89 if(srcLength==-1) {
90 srcLength=(int32_t)uprv_strlen(src);
91 }
92 if(srcLength>0) {
93 if(cnv!=0) {
94 // use the provided converter
95 ucnv_resetToUnicode(cnv);
96 doCodepageCreate(src, srcLength, cnv, errorCode);
97 } else {
98 // use the default converter
99 cnv=u_getDefaultConverter(&errorCode);
100 doCodepageCreate(src, srcLength, cnv, errorCode);
101 u_releaseDefaultConverter(cnv);
102 }
103 }
374ca955 104 }
374ca955 105
73c04bcf
A
106 if(U_FAILURE(errorCode)) {
107 setToBogus();
108 }
374ca955 109 }
374ca955
A
110}
111
112//========================================
113// Codeset conversion
114//========================================
729e4ab9
A
115
116#if !U_CHARSET_IS_UTF8
117
118int32_t
119UnicodeString::extract(int32_t start,
120 int32_t length,
121 char *target,
122 uint32_t dstSize) const {
123 return extract(start, length, target, dstSize, 0);
124}
125
126// else see unistr.cpp
127#endif
128
374ca955
A
129int32_t
130UnicodeString::extract(int32_t start,
131 int32_t length,
132 char *target,
133 uint32_t dstSize,
134 const char *codepage) const
135{
73c04bcf
A
136 // if the arguments are illegal, then do nothing
137 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
138 return 0;
139 }
140
141 // pin the indices to legal values
142 pinIndices(start, length);
143
729e4ab9
A
144 // We need to cast dstSize to int32_t for all subsequent code.
145 // I don't know why the API was defined with uint32_t but we are stuck with it.
146 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
147 // as a limit in some functions, it may wrap around and yield a pointer
148 // that compares less-than target.
149 int32_t capacity;
150 if(dstSize < 0x7fffffff) {
151 // Assume that the capacity is real and a limit pointer won't wrap around.
152 capacity = (int32_t)dstSize;
153 } else {
154 // Pin the capacity so that a limit pointer does not wrap around.
155 char *targetLimit = (char *)U_MAX_PTR(target);
156 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
157 // greater than target and does not wrap around the top of the address space.
158 capacity = (int32_t)(targetLimit - target);
159 }
160
73c04bcf
A
161 // create the converter
162 UConverter *converter;
163 UErrorCode status = U_ZERO_ERROR;
164
165 // just write the NUL if the string length is 0
166 if(length == 0) {
729e4ab9 167 return u_terminateChars(target, capacity, 0, &status);
73c04bcf
A
168 }
169
170 // if the codepage is the default, use our cache
171 // if it is an empty string, then use the "invariant character" conversion
172 if (codepage == 0) {
729e4ab9
A
173 const char *defaultName = ucnv_getDefaultName();
174 if(UCNV_FAST_IS_UTF8(defaultName)) {
175 return toUTF8(start, length, target, capacity);
176 }
73c04bcf
A
177 converter = u_getDefaultConverter(&status);
178 } else if (*codepage == 0) {
179 // use the "invariant characters" conversion
180 int32_t destLength;
729e4ab9 181 if(length <= capacity) {
73c04bcf
A
182 destLength = length;
183 } else {
729e4ab9 184 destLength = capacity;
73c04bcf
A
185 }
186 u_UCharsToChars(getArrayStart() + start, target, destLength);
729e4ab9 187 return u_terminateChars(target, capacity, length, &status);
374ca955 188 } else {
73c04bcf 189 converter = ucnv_open(codepage, &status);
374ca955 190 }
73c04bcf 191
729e4ab9 192 length = doExtract(start, length, target, capacity, converter, status);
73c04bcf
A
193
194 // close the converter
195 if (codepage == 0) {
196 u_releaseDefaultConverter(converter);
197 } else {
198 ucnv_close(converter);
199 }
200
201 return length;
374ca955
A
202}
203
204int32_t
205UnicodeString::extract(char *dest, int32_t destCapacity,
206 UConverter *cnv,
73c04bcf
A
207 UErrorCode &errorCode) const
208{
374ca955 209 if(U_FAILURE(errorCode)) {
73c04bcf 210 return 0;
374ca955 211 }
374ca955 212
73c04bcf
A
213 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
214 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
215 return 0;
216 }
374ca955 217
73c04bcf 218 // nothing to do?
46f4442e 219 if(isEmpty()) {
73c04bcf
A
220 return u_terminateChars(dest, destCapacity, 0, &errorCode);
221 }
374ca955 222
73c04bcf
A
223 // get the converter
224 UBool isDefaultConverter;
225 if(cnv==0) {
226 isDefaultConverter=TRUE;
227 cnv=u_getDefaultConverter(&errorCode);
228 if(U_FAILURE(errorCode)) {
229 return 0;
230 }
231 } else {
232 isDefaultConverter=FALSE;
233 ucnv_resetFromUnicode(cnv);
234 }
235
236 // convert
46f4442e 237 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
73c04bcf
A
238
239 // release the converter
240 if(isDefaultConverter) {
241 u_releaseDefaultConverter(cnv);
242 }
243
46f4442e 244 return len;
374ca955
A
245}
246
247int32_t
248UnicodeString::doExtract(int32_t start, int32_t length,
249 char *dest, int32_t destCapacity,
250 UConverter *cnv,
73c04bcf
A
251 UErrorCode &errorCode) const
252{
253 if(U_FAILURE(errorCode)) {
254 if(destCapacity!=0) {
255 *dest=0;
256 }
257 return 0;
258 }
259
46f4442e 260 const UChar *src=getArrayStart()+start, *srcLimit=src+length;
73c04bcf
A
261 char *originalDest=dest;
262 const char *destLimit;
263
264 if(destCapacity==0) {
265 destLimit=dest=0;
266 } else if(destCapacity==-1) {
267 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
268 destLimit=(char*)U_MAX_PTR(dest);
269 // for NUL-termination, translate into highest int32_t
270 destCapacity=0x7fffffff;
271 } else {
272 destLimit=dest+destCapacity;
273 }
274
275 // perform the conversion
276 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
277 length=(int32_t)(dest-originalDest);
278
279 // if an overflow occurs, then get the preflighting length
280 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
281 char buffer[1024];
282
283 destLimit=buffer+sizeof(buffer);
284 do {
285 dest=buffer;
286 errorCode=U_ZERO_ERROR;
287 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
288 length+=(int32_t)(dest-buffer);
289 } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
374ca955 290 }
73c04bcf
A
291
292 return u_terminateChars(originalDest, destCapacity, length, &errorCode);
374ca955
A
293}
294
295void
296UnicodeString::doCodepageCreate(const char *codepageData,
73c04bcf
A
297 int32_t dataLength,
298 const char *codepage)
374ca955 299{
73c04bcf
A
300 // if there's nothing to convert, do nothing
301 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
302 return;
303 }
304 if(dataLength == -1) {
305 dataLength = (int32_t)uprv_strlen(codepageData);
306 }
307
308 UErrorCode status = U_ZERO_ERROR;
309
310 // create the converter
311 // if the codepage is the default, use our cache
312 // if it is an empty string, then use the "invariant character" conversion
729e4ab9
A
313 UConverter *converter;
314 if (codepage == 0) {
315 const char *defaultName = ucnv_getDefaultName();
316 if(UCNV_FAST_IS_UTF8(defaultName)) {
317 setToUTF8(StringPiece(codepageData, dataLength));
318 return;
319 }
320 converter = u_getDefaultConverter(&status);
321 } else if(*codepage == 0) {
73c04bcf
A
322 // use the "invariant characters" conversion
323 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
324 u_charsToUChars(codepageData, getArrayStart(), dataLength);
46f4442e 325 setLength(dataLength);
73c04bcf
A
326 } else {
327 setToBogus();
328 }
329 return;
729e4ab9
A
330 } else {
331 converter = ucnv_open(codepage, &status);
73c04bcf
A
332 }
333
729e4ab9
A
334 // if we failed, set the appropriate flags and return
335 if(U_FAILURE(status)) {
336 setToBogus();
337 return;
338 }
339
340 // perform the conversion
73c04bcf
A
341 doCodepageCreate(codepageData, dataLength, converter, status);
342 if(U_FAILURE(status)) {
343 setToBogus();
344 }
345
346 // close the converter
347 if(codepage == 0) {
348 u_releaseDefaultConverter(converter);
374ca955 349 } else {
73c04bcf 350 ucnv_close(converter);
374ca955 351 }
374ca955
A
352}
353
354void
355UnicodeString::doCodepageCreate(const char *codepageData,
356 int32_t dataLength,
357 UConverter *converter,
73c04bcf
A
358 UErrorCode &status)
359{
360 if(U_FAILURE(status)) {
361 return;
374ca955
A
362 }
363
73c04bcf
A
364 // set up the conversion parameters
365 const char *mySource = codepageData;
366 const char *mySourceEnd = mySource + dataLength;
46f4442e 367 UChar *array, *myTarget;
73c04bcf
A
368
369 // estimate the size needed:
46f4442e
A
370 int32_t arraySize;
371 if(dataLength <= US_STACKBUF_SIZE) {
372 // try to use the stack buffer
373 arraySize = US_STACKBUF_SIZE;
374 } else {
375 // 1.25 UChar's per source byte should cover most cases
376 arraySize = dataLength + (dataLength >> 2);
377 }
73c04bcf
A
378
379 // we do not care about the current contents
380 UBool doCopyArray = FALSE;
381 for(;;) {
382 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
383 setToBogus();
384 break;
385 }
374ca955 386
73c04bcf 387 // perform the conversion
46f4442e
A
388 array = getArrayStart();
389 myTarget = array + length();
390 ucnv_toUnicode(converter, &myTarget, array + getCapacity(),
73c04bcf 391 &mySource, mySourceEnd, 0, TRUE, &status);
374ca955 392
73c04bcf 393 // update the conversion parameters
46f4442e 394 setLength((int32_t)(myTarget - array));
374ca955 395
73c04bcf
A
396 // allocate more space and copy data, if needed
397 if(status == U_BUFFER_OVERFLOW_ERROR) {
398 // reset the error code
399 status = U_ZERO_ERROR;
374ca955 400
73c04bcf
A
401 // keep the previous conversion results
402 doCopyArray = TRUE;
403
404 // estimate the new size needed, larger than before
405 // try 2 UChar's per remaining source byte
46f4442e 406 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
73c04bcf
A
407 } else {
408 break;
409 }
374ca955 410 }
374ca955
A
411}
412
413U_NAMESPACE_END
414
415#endif