]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unistr_cnv.cpp
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / common / unistr_cnv.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
374ca955
A
3/*
4*******************************************************************************
5*
b331163b 6* Copyright (C) 1999-2014, International Business Machines
374ca955
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: unistr_cnv.cpp
f3c0d7a5 11* encoding: UTF-8
374ca955
A
12* tab size: 8 (not used)
13* indentation:2
14*
15* created on: 2004aug19
16* created by: Markus W. Scherer
17*
18* Character conversion functions moved here from unistr.cpp
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_CONVERSION
24
25#include "unicode/putil.h"
26#include "cstring.h"
27#include "cmemory.h"
28#include "unicode/ustring.h"
29#include "unicode/unistr.h"
30#include "unicode/ucnv.h"
729e4ab9 31#include "ucnv_imp.h"
374ca955
A
32#include "putilimp.h"
33#include "ustr_cnv.h"
34#include "ustr_imp.h"
35
36U_NAMESPACE_BEGIN
37
38//========================================
39// Constructors
40//========================================
41
729e4ab9
A
42#if !U_CHARSET_IS_UTF8
43
b331163b
A
44UnicodeString::UnicodeString(const char *codepageData) {
45 fUnion.fFields.fLengthAndFlags = kShortString;
729e4ab9
A
46 if(codepageData != 0) {
47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
48 }
49}
50
51UnicodeString::UnicodeString(const char *codepageData,
b331163b
A
52 int32_t dataLength) {
53 fUnion.fFields.fLengthAndFlags = kShortString;
729e4ab9
A
54 if(codepageData != 0) {
55 doCodepageCreate(codepageData, dataLength, 0);
56 }
57}
58
59// else see unistr.cpp
60#endif
61
374ca955 62UnicodeString::UnicodeString(const char *codepageData,
b331163b
A
63 const char *codepage) {
64 fUnion.fFields.fLengthAndFlags = kShortString;
73c04bcf
A
65 if(codepageData != 0) {
66 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
67 }
374ca955
A
68}
69
374ca955
A
70UnicodeString::UnicodeString(const char *codepageData,
71 int32_t dataLength,
b331163b
A
72 const char *codepage) {
73 fUnion.fFields.fLengthAndFlags = kShortString;
73c04bcf
A
74 if(codepageData != 0) {
75 doCodepageCreate(codepageData, dataLength, codepage);
76 }
374ca955
A
77}
78
79UnicodeString::UnicodeString(const char *src, int32_t srcLength,
80 UConverter *cnv,
b331163b
A
81 UErrorCode &errorCode) {
82 fUnion.fFields.fLengthAndFlags = kShortString;
73c04bcf
A
83 if(U_SUCCESS(errorCode)) {
84 // check arguments
85 if(src==NULL) {
86 // treat as an empty string, do nothing more
87 } else if(srcLength<-1) {
88 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
374ca955 89 } else {
73c04bcf
A
90 // get input length
91 if(srcLength==-1) {
92 srcLength=(int32_t)uprv_strlen(src);
93 }
94 if(srcLength>0) {
95 if(cnv!=0) {
96 // use the provided converter
97 ucnv_resetToUnicode(cnv);
98 doCodepageCreate(src, srcLength, cnv, errorCode);
99 } else {
100 // use the default converter
101 cnv=u_getDefaultConverter(&errorCode);
102 doCodepageCreate(src, srcLength, cnv, errorCode);
103 u_releaseDefaultConverter(cnv);
104 }
105 }
374ca955 106 }
374ca955 107
73c04bcf
A
108 if(U_FAILURE(errorCode)) {
109 setToBogus();
110 }
374ca955 111 }
374ca955
A
112}
113
114//========================================
115// Codeset conversion
116//========================================
729e4ab9
A
117
118#if !U_CHARSET_IS_UTF8
119
120int32_t
121UnicodeString::extract(int32_t start,
122 int32_t length,
123 char *target,
124 uint32_t dstSize) const {
125 return extract(start, length, target, dstSize, 0);
126}
127
128// else see unistr.cpp
129#endif
130
374ca955
A
131int32_t
132UnicodeString::extract(int32_t start,
133 int32_t length,
134 char *target,
135 uint32_t dstSize,
136 const char *codepage) const
137{
73c04bcf
A
138 // if the arguments are illegal, then do nothing
139 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
140 return 0;
141 }
142
143 // pin the indices to legal values
144 pinIndices(start, length);
145
729e4ab9
A
146 // We need to cast dstSize to int32_t for all subsequent code.
147 // I don't know why the API was defined with uint32_t but we are stuck with it.
148 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
149 // as a limit in some functions, it may wrap around and yield a pointer
150 // that compares less-than target.
151 int32_t capacity;
152 if(dstSize < 0x7fffffff) {
153 // Assume that the capacity is real and a limit pointer won't wrap around.
154 capacity = (int32_t)dstSize;
155 } else {
156 // Pin the capacity so that a limit pointer does not wrap around.
157 char *targetLimit = (char *)U_MAX_PTR(target);
158 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
159 // greater than target and does not wrap around the top of the address space.
160 capacity = (int32_t)(targetLimit - target);
161 }
162
73c04bcf
A
163 // create the converter
164 UConverter *converter;
165 UErrorCode status = U_ZERO_ERROR;
166
167 // just write the NUL if the string length is 0
168 if(length == 0) {
729e4ab9 169 return u_terminateChars(target, capacity, 0, &status);
73c04bcf
A
170 }
171
172 // if the codepage is the default, use our cache
173 // if it is an empty string, then use the "invariant character" conversion
174 if (codepage == 0) {
729e4ab9
A
175 const char *defaultName = ucnv_getDefaultName();
176 if(UCNV_FAST_IS_UTF8(defaultName)) {
177 return toUTF8(start, length, target, capacity);
178 }
73c04bcf
A
179 converter = u_getDefaultConverter(&status);
180 } else if (*codepage == 0) {
181 // use the "invariant characters" conversion
182 int32_t destLength;
729e4ab9 183 if(length <= capacity) {
73c04bcf
A
184 destLength = length;
185 } else {
729e4ab9 186 destLength = capacity;
73c04bcf
A
187 }
188 u_UCharsToChars(getArrayStart() + start, target, destLength);
729e4ab9 189 return u_terminateChars(target, capacity, length, &status);
374ca955 190 } else {
73c04bcf 191 converter = ucnv_open(codepage, &status);
374ca955 192 }
73c04bcf 193
729e4ab9 194 length = doExtract(start, length, target, capacity, converter, status);
73c04bcf
A
195
196 // close the converter
197 if (codepage == 0) {
198 u_releaseDefaultConverter(converter);
199 } else {
200 ucnv_close(converter);
201 }
202
203 return length;
374ca955
A
204}
205
206int32_t
207UnicodeString::extract(char *dest, int32_t destCapacity,
208 UConverter *cnv,
73c04bcf
A
209 UErrorCode &errorCode) const
210{
374ca955 211 if(U_FAILURE(errorCode)) {
73c04bcf 212 return 0;
374ca955 213 }
374ca955 214
73c04bcf
A
215 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
216 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
217 return 0;
218 }
374ca955 219
73c04bcf 220 // nothing to do?
46f4442e 221 if(isEmpty()) {
73c04bcf
A
222 return u_terminateChars(dest, destCapacity, 0, &errorCode);
223 }
374ca955 224
73c04bcf
A
225 // get the converter
226 UBool isDefaultConverter;
227 if(cnv==0) {
228 isDefaultConverter=TRUE;
229 cnv=u_getDefaultConverter(&errorCode);
230 if(U_FAILURE(errorCode)) {
231 return 0;
232 }
233 } else {
234 isDefaultConverter=FALSE;
235 ucnv_resetFromUnicode(cnv);
236 }
237
238 // convert
46f4442e 239 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
73c04bcf
A
240
241 // release the converter
242 if(isDefaultConverter) {
243 u_releaseDefaultConverter(cnv);
244 }
245
46f4442e 246 return len;
374ca955
A
247}
248
249int32_t
250UnicodeString::doExtract(int32_t start, int32_t length,
251 char *dest, int32_t destCapacity,
252 UConverter *cnv,
73c04bcf
A
253 UErrorCode &errorCode) const
254{
255 if(U_FAILURE(errorCode)) {
256 if(destCapacity!=0) {
257 *dest=0;
258 }
259 return 0;
260 }
261
46f4442e 262 const UChar *src=getArrayStart()+start, *srcLimit=src+length;
73c04bcf
A
263 char *originalDest=dest;
264 const char *destLimit;
265
266 if(destCapacity==0) {
267 destLimit=dest=0;
268 } else if(destCapacity==-1) {
269 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
270 destLimit=(char*)U_MAX_PTR(dest);
271 // for NUL-termination, translate into highest int32_t
272 destCapacity=0x7fffffff;
273 } else {
274 destLimit=dest+destCapacity;
275 }
276
277 // perform the conversion
278 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
279 length=(int32_t)(dest-originalDest);
280
281 // if an overflow occurs, then get the preflighting length
282 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
283 char buffer[1024];
284
285 destLimit=buffer+sizeof(buffer);
286 do {
287 dest=buffer;
288 errorCode=U_ZERO_ERROR;
289 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
290 length+=(int32_t)(dest-buffer);
291 } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
374ca955 292 }
73c04bcf
A
293
294 return u_terminateChars(originalDest, destCapacity, length, &errorCode);
374ca955
A
295}
296
297void
298UnicodeString::doCodepageCreate(const char *codepageData,
73c04bcf
A
299 int32_t dataLength,
300 const char *codepage)
374ca955 301{
73c04bcf
A
302 // if there's nothing to convert, do nothing
303 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
304 return;
305 }
306 if(dataLength == -1) {
307 dataLength = (int32_t)uprv_strlen(codepageData);
308 }
309
310 UErrorCode status = U_ZERO_ERROR;
311
312 // create the converter
313 // if the codepage is the default, use our cache
314 // if it is an empty string, then use the "invariant character" conversion
729e4ab9
A
315 UConverter *converter;
316 if (codepage == 0) {
317 const char *defaultName = ucnv_getDefaultName();
318 if(UCNV_FAST_IS_UTF8(defaultName)) {
319 setToUTF8(StringPiece(codepageData, dataLength));
320 return;
321 }
322 converter = u_getDefaultConverter(&status);
323 } else if(*codepage == 0) {
73c04bcf
A
324 // use the "invariant characters" conversion
325 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
326 u_charsToUChars(codepageData, getArrayStart(), dataLength);
46f4442e 327 setLength(dataLength);
73c04bcf
A
328 } else {
329 setToBogus();
330 }
331 return;
729e4ab9
A
332 } else {
333 converter = ucnv_open(codepage, &status);
73c04bcf
A
334 }
335
729e4ab9
A
336 // if we failed, set the appropriate flags and return
337 if(U_FAILURE(status)) {
338 setToBogus();
339 return;
340 }
341
342 // perform the conversion
73c04bcf
A
343 doCodepageCreate(codepageData, dataLength, converter, status);
344 if(U_FAILURE(status)) {
345 setToBogus();
346 }
347
348 // close the converter
349 if(codepage == 0) {
350 u_releaseDefaultConverter(converter);
374ca955 351 } else {
73c04bcf 352 ucnv_close(converter);
374ca955 353 }
374ca955
A
354}
355
356void
357UnicodeString::doCodepageCreate(const char *codepageData,
358 int32_t dataLength,
359 UConverter *converter,
73c04bcf
A
360 UErrorCode &status)
361{
362 if(U_FAILURE(status)) {
363 return;
374ca955
A
364 }
365
73c04bcf
A
366 // set up the conversion parameters
367 const char *mySource = codepageData;
368 const char *mySourceEnd = mySource + dataLength;
46f4442e 369 UChar *array, *myTarget;
73c04bcf
A
370
371 // estimate the size needed:
46f4442e
A
372 int32_t arraySize;
373 if(dataLength <= US_STACKBUF_SIZE) {
374 // try to use the stack buffer
375 arraySize = US_STACKBUF_SIZE;
376 } else {
377 // 1.25 UChar's per source byte should cover most cases
378 arraySize = dataLength + (dataLength >> 2);
379 }
73c04bcf
A
380
381 // we do not care about the current contents
382 UBool doCopyArray = FALSE;
383 for(;;) {
384 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
385 setToBogus();
386 break;
387 }
374ca955 388
73c04bcf 389 // perform the conversion
46f4442e
A
390 array = getArrayStart();
391 myTarget = array + length();
392 ucnv_toUnicode(converter, &myTarget, array + getCapacity(),
73c04bcf 393 &mySource, mySourceEnd, 0, TRUE, &status);
374ca955 394
73c04bcf 395 // update the conversion parameters
46f4442e 396 setLength((int32_t)(myTarget - array));
374ca955 397
73c04bcf
A
398 // allocate more space and copy data, if needed
399 if(status == U_BUFFER_OVERFLOW_ERROR) {
400 // reset the error code
401 status = U_ZERO_ERROR;
374ca955 402
73c04bcf
A
403 // keep the previous conversion results
404 doCopyArray = TRUE;
405
406 // estimate the new size needed, larger than before
407 // try 2 UChar's per remaining source byte
46f4442e 408 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
73c04bcf
A
409 } else {
410 break;
411 }
374ca955 412 }
374ca955
A
413}
414
415U_NAMESPACE_END
416
417#endif