]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unistr_cnv.cpp
ICU-461.12.tar.gz
[apple/icu.git] / icuSources / common / unistr_cnv.cpp
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
729e4ab9 4* Copyright (C) 1999-2010, International Business Machines
374ca955
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: unistr_cnv.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:2
12*
13* created on: 2004aug19
14* created by: Markus W. Scherer
15*
16* Character conversion functions moved here from unistr.cpp
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_CONVERSION
22
23#include "unicode/putil.h"
24#include "cstring.h"
25#include "cmemory.h"
26#include "unicode/ustring.h"
27#include "unicode/unistr.h"
28#include "unicode/ucnv.h"
729e4ab9 29#include "ucnv_imp.h"
374ca955
A
30#include "putilimp.h"
31#include "ustr_cnv.h"
32#include "ustr_imp.h"
33
34U_NAMESPACE_BEGIN
35
36//========================================
37// Constructors
38//========================================
39
729e4ab9
A
40#if !U_CHARSET_IS_UTF8
41
42UnicodeString::UnicodeString(const char *codepageData)
43 : fShortLength(0),
44 fFlags(kShortString)
45{
46 if(codepageData != 0) {
47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
48 }
49}
50
51UnicodeString::UnicodeString(const char *codepageData,
52 int32_t dataLength)
53 : fShortLength(0),
54 fFlags(kShortString)
55{
56 if(codepageData != 0) {
57 doCodepageCreate(codepageData, dataLength, 0);
58 }
59}
60
61// else see unistr.cpp
62#endif
63
374ca955
A
64UnicodeString::UnicodeString(const char *codepageData,
65 const char *codepage)
46f4442e 66 : fShortLength(0),
374ca955
A
67 fFlags(kShortString)
68{
73c04bcf
A
69 if(codepageData != 0) {
70 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
71 }
374ca955
A
72}
73
374ca955
A
74UnicodeString::UnicodeString(const char *codepageData,
75 int32_t dataLength,
76 const char *codepage)
46f4442e 77 : fShortLength(0),
374ca955
A
78 fFlags(kShortString)
79{
73c04bcf
A
80 if(codepageData != 0) {
81 doCodepageCreate(codepageData, dataLength, codepage);
82 }
374ca955
A
83}
84
85UnicodeString::UnicodeString(const char *src, int32_t srcLength,
86 UConverter *cnv,
87 UErrorCode &errorCode)
46f4442e 88 : fShortLength(0),
374ca955
A
89 fFlags(kShortString)
90{
73c04bcf
A
91 if(U_SUCCESS(errorCode)) {
92 // check arguments
93 if(src==NULL) {
94 // treat as an empty string, do nothing more
95 } else if(srcLength<-1) {
96 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
374ca955 97 } else {
73c04bcf
A
98 // get input length
99 if(srcLength==-1) {
100 srcLength=(int32_t)uprv_strlen(src);
101 }
102 if(srcLength>0) {
103 if(cnv!=0) {
104 // use the provided converter
105 ucnv_resetToUnicode(cnv);
106 doCodepageCreate(src, srcLength, cnv, errorCode);
107 } else {
108 // use the default converter
109 cnv=u_getDefaultConverter(&errorCode);
110 doCodepageCreate(src, srcLength, cnv, errorCode);
111 u_releaseDefaultConverter(cnv);
112 }
113 }
374ca955 114 }
374ca955 115
73c04bcf
A
116 if(U_FAILURE(errorCode)) {
117 setToBogus();
118 }
374ca955 119 }
374ca955
A
120}
121
122//========================================
123// Codeset conversion
124//========================================
729e4ab9
A
125
126#if !U_CHARSET_IS_UTF8
127
128int32_t
129UnicodeString::extract(int32_t start,
130 int32_t length,
131 char *target,
132 uint32_t dstSize) const {
133 return extract(start, length, target, dstSize, 0);
134}
135
136// else see unistr.cpp
137#endif
138
374ca955
A
139int32_t
140UnicodeString::extract(int32_t start,
141 int32_t length,
142 char *target,
143 uint32_t dstSize,
144 const char *codepage) const
145{
73c04bcf
A
146 // if the arguments are illegal, then do nothing
147 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
148 return 0;
149 }
150
151 // pin the indices to legal values
152 pinIndices(start, length);
153
729e4ab9
A
154 // We need to cast dstSize to int32_t for all subsequent code.
155 // I don't know why the API was defined with uint32_t but we are stuck with it.
156 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
157 // as a limit in some functions, it may wrap around and yield a pointer
158 // that compares less-than target.
159 int32_t capacity;
160 if(dstSize < 0x7fffffff) {
161 // Assume that the capacity is real and a limit pointer won't wrap around.
162 capacity = (int32_t)dstSize;
163 } else {
164 // Pin the capacity so that a limit pointer does not wrap around.
165 char *targetLimit = (char *)U_MAX_PTR(target);
166 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
167 // greater than target and does not wrap around the top of the address space.
168 capacity = (int32_t)(targetLimit - target);
169 }
170
73c04bcf
A
171 // create the converter
172 UConverter *converter;
173 UErrorCode status = U_ZERO_ERROR;
174
175 // just write the NUL if the string length is 0
176 if(length == 0) {
729e4ab9 177 return u_terminateChars(target, capacity, 0, &status);
73c04bcf
A
178 }
179
180 // if the codepage is the default, use our cache
181 // if it is an empty string, then use the "invariant character" conversion
182 if (codepage == 0) {
729e4ab9
A
183 const char *defaultName = ucnv_getDefaultName();
184 if(UCNV_FAST_IS_UTF8(defaultName)) {
185 return toUTF8(start, length, target, capacity);
186 }
73c04bcf
A
187 converter = u_getDefaultConverter(&status);
188 } else if (*codepage == 0) {
189 // use the "invariant characters" conversion
190 int32_t destLength;
729e4ab9 191 if(length <= capacity) {
73c04bcf
A
192 destLength = length;
193 } else {
729e4ab9 194 destLength = capacity;
73c04bcf
A
195 }
196 u_UCharsToChars(getArrayStart() + start, target, destLength);
729e4ab9 197 return u_terminateChars(target, capacity, length, &status);
374ca955 198 } else {
73c04bcf 199 converter = ucnv_open(codepage, &status);
374ca955 200 }
73c04bcf 201
729e4ab9 202 length = doExtract(start, length, target, capacity, converter, status);
73c04bcf
A
203
204 // close the converter
205 if (codepage == 0) {
206 u_releaseDefaultConverter(converter);
207 } else {
208 ucnv_close(converter);
209 }
210
211 return length;
374ca955
A
212}
213
214int32_t
215UnicodeString::extract(char *dest, int32_t destCapacity,
216 UConverter *cnv,
73c04bcf
A
217 UErrorCode &errorCode) const
218{
374ca955 219 if(U_FAILURE(errorCode)) {
73c04bcf 220 return 0;
374ca955 221 }
374ca955 222
73c04bcf
A
223 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
224 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
225 return 0;
226 }
374ca955 227
73c04bcf 228 // nothing to do?
46f4442e 229 if(isEmpty()) {
73c04bcf
A
230 return u_terminateChars(dest, destCapacity, 0, &errorCode);
231 }
374ca955 232
73c04bcf
A
233 // get the converter
234 UBool isDefaultConverter;
235 if(cnv==0) {
236 isDefaultConverter=TRUE;
237 cnv=u_getDefaultConverter(&errorCode);
238 if(U_FAILURE(errorCode)) {
239 return 0;
240 }
241 } else {
242 isDefaultConverter=FALSE;
243 ucnv_resetFromUnicode(cnv);
244 }
245
246 // convert
46f4442e 247 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
73c04bcf
A
248
249 // release the converter
250 if(isDefaultConverter) {
251 u_releaseDefaultConverter(cnv);
252 }
253
46f4442e 254 return len;
374ca955
A
255}
256
257int32_t
258UnicodeString::doExtract(int32_t start, int32_t length,
259 char *dest, int32_t destCapacity,
260 UConverter *cnv,
73c04bcf
A
261 UErrorCode &errorCode) const
262{
263 if(U_FAILURE(errorCode)) {
264 if(destCapacity!=0) {
265 *dest=0;
266 }
267 return 0;
268 }
269
46f4442e 270 const UChar *src=getArrayStart()+start, *srcLimit=src+length;
73c04bcf
A
271 char *originalDest=dest;
272 const char *destLimit;
273
274 if(destCapacity==0) {
275 destLimit=dest=0;
276 } else if(destCapacity==-1) {
277 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
278 destLimit=(char*)U_MAX_PTR(dest);
279 // for NUL-termination, translate into highest int32_t
280 destCapacity=0x7fffffff;
281 } else {
282 destLimit=dest+destCapacity;
283 }
284
285 // perform the conversion
286 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
287 length=(int32_t)(dest-originalDest);
288
289 // if an overflow occurs, then get the preflighting length
290 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
291 char buffer[1024];
292
293 destLimit=buffer+sizeof(buffer);
294 do {
295 dest=buffer;
296 errorCode=U_ZERO_ERROR;
297 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
298 length+=(int32_t)(dest-buffer);
299 } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
374ca955 300 }
73c04bcf
A
301
302 return u_terminateChars(originalDest, destCapacity, length, &errorCode);
374ca955
A
303}
304
305void
306UnicodeString::doCodepageCreate(const char *codepageData,
73c04bcf
A
307 int32_t dataLength,
308 const char *codepage)
374ca955 309{
73c04bcf
A
310 // if there's nothing to convert, do nothing
311 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
312 return;
313 }
314 if(dataLength == -1) {
315 dataLength = (int32_t)uprv_strlen(codepageData);
316 }
317
318 UErrorCode status = U_ZERO_ERROR;
319
320 // create the converter
321 // if the codepage is the default, use our cache
322 // if it is an empty string, then use the "invariant character" conversion
729e4ab9
A
323 UConverter *converter;
324 if (codepage == 0) {
325 const char *defaultName = ucnv_getDefaultName();
326 if(UCNV_FAST_IS_UTF8(defaultName)) {
327 setToUTF8(StringPiece(codepageData, dataLength));
328 return;
329 }
330 converter = u_getDefaultConverter(&status);
331 } else if(*codepage == 0) {
73c04bcf
A
332 // use the "invariant characters" conversion
333 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
334 u_charsToUChars(codepageData, getArrayStart(), dataLength);
46f4442e 335 setLength(dataLength);
73c04bcf
A
336 } else {
337 setToBogus();
338 }
339 return;
729e4ab9
A
340 } else {
341 converter = ucnv_open(codepage, &status);
73c04bcf
A
342 }
343
729e4ab9
A
344 // if we failed, set the appropriate flags and return
345 if(U_FAILURE(status)) {
346 setToBogus();
347 return;
348 }
349
350 // perform the conversion
73c04bcf
A
351 doCodepageCreate(codepageData, dataLength, converter, status);
352 if(U_FAILURE(status)) {
353 setToBogus();
354 }
355
356 // close the converter
357 if(codepage == 0) {
358 u_releaseDefaultConverter(converter);
374ca955 359 } else {
73c04bcf 360 ucnv_close(converter);
374ca955 361 }
374ca955
A
362}
363
364void
365UnicodeString::doCodepageCreate(const char *codepageData,
366 int32_t dataLength,
367 UConverter *converter,
73c04bcf
A
368 UErrorCode &status)
369{
370 if(U_FAILURE(status)) {
371 return;
374ca955
A
372 }
373
73c04bcf
A
374 // set up the conversion parameters
375 const char *mySource = codepageData;
376 const char *mySourceEnd = mySource + dataLength;
46f4442e 377 UChar *array, *myTarget;
73c04bcf
A
378
379 // estimate the size needed:
46f4442e
A
380 int32_t arraySize;
381 if(dataLength <= US_STACKBUF_SIZE) {
382 // try to use the stack buffer
383 arraySize = US_STACKBUF_SIZE;
384 } else {
385 // 1.25 UChar's per source byte should cover most cases
386 arraySize = dataLength + (dataLength >> 2);
387 }
73c04bcf
A
388
389 // we do not care about the current contents
390 UBool doCopyArray = FALSE;
391 for(;;) {
392 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
393 setToBogus();
394 break;
395 }
374ca955 396
73c04bcf 397 // perform the conversion
46f4442e
A
398 array = getArrayStart();
399 myTarget = array + length();
400 ucnv_toUnicode(converter, &myTarget, array + getCapacity(),
73c04bcf 401 &mySource, mySourceEnd, 0, TRUE, &status);
374ca955 402
73c04bcf 403 // update the conversion parameters
46f4442e 404 setLength((int32_t)(myTarget - array));
374ca955 405
73c04bcf
A
406 // allocate more space and copy data, if needed
407 if(status == U_BUFFER_OVERFLOW_ERROR) {
408 // reset the error code
409 status = U_ZERO_ERROR;
374ca955 410
73c04bcf
A
411 // keep the previous conversion results
412 doCopyArray = TRUE;
413
414 // estimate the new size needed, larger than before
415 // try 2 UChar's per remaining source byte
46f4442e 416 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
73c04bcf
A
417 } else {
418 break;
419 }
374ca955 420 }
374ca955
A
421}
422
423U_NAMESPACE_END
424
425#endif