]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/unistr_cnv.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 1999-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: unistr_cnv.cpp
12 * tab size: 8 (not used)
15 * created on: 2004aug19
16 * created by: Markus W. Scherer
18 * Character conversion functions moved here from unistr.cpp
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_CONVERSION
25 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "unicode/unistr.h"
30 #include "unicode/ucnv.h"
38 //========================================
40 //========================================
42 #if !U_CHARSET_IS_UTF8
// Constructor taking only a char string; the length is obtained with
// uprv_strlen(). It is compiled only under the `#if !U_CHARSET_IS_UTF8`
// guard that precedes it.
// NOTE(review): this listing prefixes each line with its original source line
// number and omits some lines (the closing braces are not shown here).
44 UnicodeString::UnicodeString(const char *codepageData
) {
// Set fLengthAndFlags to kShortString before any conversion is attempted.
45 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
// A null pointer skips the conversion and leaves the object as set up above.
46 if(codepageData
!= 0) {
// The trailing 0 is the codepage argument; presumably it selects the default
// codepage -- confirm against doCodepageCreate().
47 doCodepageCreate(codepageData
, (int32_t)uprv_strlen(codepageData
), 0);
// Constructor taking a char string plus a caller-supplied dataLength.
// NOTE(review): the parameter line that declares dataLength and the closing
// braces are missing from this listing; dataLength's type is not visible here.
51 UnicodeString::UnicodeString(const char *codepageData
,
// Set fLengthAndFlags to kShortString before any conversion is attempted.
53 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
// A null pointer skips the conversion.
54 if(codepageData
!= 0) {
// dataLength is forwarded unchanged; the trailing 0 is the codepage argument.
55 doCodepageCreate(codepageData
, dataLength
, 0);
59 // else see unistr.cpp
// Constructor taking a char string and a codepage name; the length is
// obtained with uprv_strlen().
// NOTE(review): the closing braces are missing from this listing.
62 UnicodeString::UnicodeString(const char *codepageData
,
63 const char *codepage
) {
// Set fLengthAndFlags to kShortString before any conversion is attempted.
64 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
// A null pointer skips the conversion.
65 if(codepageData
!= 0) {
// codepage is passed through to doCodepageCreate() without inspection here.
66 doCodepageCreate(codepageData
, (int32_t)uprv_strlen(codepageData
), codepage
);
// Constructor taking a char string, a caller-supplied dataLength and a
// codepage name.
// NOTE(review): the parameter line that declares dataLength and the closing
// braces are missing from this listing.
70 UnicodeString::UnicodeString(const char *codepageData
,
72 const char *codepage
) {
// Set fLengthAndFlags to kShortString before any conversion is attempted.
73 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
// A null pointer skips the conversion.
74 if(codepageData
!= 0) {
// Both dataLength and codepage are forwarded unchanged.
75 doCodepageCreate(codepageData
, dataLength
, codepage
);
// Constructor that converts src through a converter and reports problems via
// errorCode.
// Visible behavior:
//  - fLengthAndFlags is set to kShortString first;
//  - argument checking and conversion happen only when U_SUCCESS(errorCode);
//  - srcLength below -1 sets errorCode to U_ILLEGAL_ARGUMENT_ERROR;
//  - conversion goes through doCodepageCreate(), using either the cnv argument
//    (after ucnv_resetToUnicode) or the converter from u_getDefaultConverter(),
//    which is handed back with u_releaseDefaultConverter().
// NOTE(review): several lines are missing from this listing (the cnv parameter
// declaration, the conditions selecting each branch, the body of the final
// U_FAILURE check, and the closing braces), so branch conditions described
// here are inferred from the surviving comments -- confirm against the full file.
79 UnicodeString::UnicodeString(const char *src
, int32_t srcLength
,
81 UErrorCode
&errorCode
) {
82 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
83 if(U_SUCCESS(errorCode
)) {
86 // treat as an empty string, do nothing more
87 } else if(srcLength
<-1) {
88 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// The length is computed from the NUL terminator; presumably only when
// srcLength is -1 (the guarding line is not visible) -- confirm.
92 srcLength
=(int32_t)uprv_strlen(src
);
96 // use the provided converter
97 ucnv_resetToUnicode(cnv
);
98 doCodepageCreate(src
, srcLength
, cnv
, errorCode
);
100 // use the default converter
101 cnv
=u_getDefaultConverter(&errorCode
);
102 doCodepageCreate(src
, srcLength
, cnv
, errorCode
);
103 u_releaseDefaultConverter(cnv
);
// What happens on failure is not visible in this listing.
108 if(U_FAILURE(errorCode
)) {
114 //========================================
115 // Codeset conversion
116 //========================================
118 #if !U_CHARSET_IS_UTF8
// Overload of extract() without a codepage parameter: it forwards start,
// length, target and dstSize to the five-argument extract() and passes 0 as
// the final (codepage) argument. It is compiled only under the
// `#if !U_CHARSET_IS_UTF8` guard that precedes it.
// NOTE(review): the return type, the length/target parameter lines and the
// closing brace are missing from this listing.
121 UnicodeString::extract(int32_t start
,
124 uint32_t dstSize
) const {
125 return extract(start
, length
, target
, dstSize
, 0);
128 // else see unistr.cpp
// extract() overload that converts a range of this string into the char
// buffer `target`, using the converter named by `codepage`.
// Visible steps:
//  1. reject a null target combined with a non-zero dstSize;
//  2. pin start/length with pinIndices();
//  3. turn the unsigned dstSize into an int32_t capacity that cannot make a
//     limit pointer wrap around;
//  4. choose a conversion path: toUTF8() when the default converter name is
//     UTF-8, the cached default converter, the "invariant characters"
//     conversion for an empty codepage string, or ucnv_open(codepage);
//  5. run doExtract() and then release or close the converter.
// NOTE(review): the return type, some parameter lines, several branch
// conditions, the final return and the closing braces are missing from this
// listing; what each early exit returns is only visible where a `return` line
// survives.
132 UnicodeString::extract(int32_t start
,
136 const char *codepage
) const
138 // if the arguments are illegal, then do nothing
139 if(/*dstSize < 0 || */(dstSize
> 0 && target
== 0)) {
143 // pin the indices to legal values
144 pinIndices(start
, length
);
146 // We need to cast dstSize to int32_t for all subsequent code.
147 // I don't know why the API was defined with uint32_t but we are stuck with it.
148 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
149 // as a limit in some functions, it may wrap around and yield a pointer
150 // that compares less-than target.
152 if(dstSize
< 0x7fffffff) {
153 // Assume that the capacity is real and a limit pointer won't wrap around.
154 capacity
= (int32_t)dstSize
;
156 // Pin the capacity so that a limit pointer does not wrap around.
157 char *targetLimit
= (char *)U_MAX_PTR(target
);
158 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
159 // greater than target and does not wrap around the top of the address space.
160 capacity
= (int32_t)(targetLimit
- target
);
163 // create the converter
164 UConverter
*converter
;
165 UErrorCode status
= U_ZERO_ERROR
;
167 // just write the NUL if the string length is 0
169 return u_terminateChars(target
, capacity
, 0, &status
);
172 // if the codepage is the default, use our cache
173 // if it is an empty string, then use the "invariant character" conversion
175 const char *defaultName
= ucnv_getDefaultName();
// When the default converter name is UTF-8, toUTF8() is used and no
// converter object is obtained.
176 if(UCNV_FAST_IS_UTF8(defaultName
)) {
177 return toUTF8(start
, length
, target
, capacity
);
179 converter
= u_getDefaultConverter(&status
);
180 } else if (*codepage
== 0) {
181 // use the "invariant characters" conversion
// destLength is limited to capacity when length does not fit; the assignment
// for the fitting case is not visible in this listing.
183 if(length
<= capacity
) {
186 destLength
= capacity
;
188 u_UCharsToChars(getArrayStart() + start
, target
, destLength
);
// u_terminateChars() is given the full length here, not destLength.
189 return u_terminateChars(target
, capacity
, length
, &status
);
191 converter
= ucnv_open(codepage
, &status
);
194 length
= doExtract(start
, length
, target
, capacity
, converter
, status
);
196 // close the converter
// Presumably the default converter is released and an opened one is closed;
// the condition choosing between the two calls is not visible -- confirm.
198 u_releaseDefaultConverter(converter
);
200 ucnv_close(converter
);
// extract() overload that converts the whole string into `dest` and reports
// problems via errorCode.
// Visible steps:
//  - nothing is attempted when errorCode already indicates failure;
//  - a bogus string, a negative destCapacity, or a null dest with a positive
//    destCapacity sets errorCode to U_ILLEGAL_ARGUMENT_ERROR;
//  - one path only NUL-terminates via u_terminateChars() with length 0;
//  - the converter is either the default one (isDefaultConverter = TRUE,
//    obtained with u_getDefaultConverter) or cnv, which is reset with
//    ucnv_resetFromUnicode();
//  - doExtract(0, length(), ...) does the conversion, and a default converter
//    is released afterwards.
// NOTE(review): the return type, the cnv parameter line, the conditions that
// select each path, the return statements of the early exits and the closing
// braces are missing from this listing.
207 UnicodeString::extract(char *dest
, int32_t destCapacity
,
209 UErrorCode
&errorCode
) const
211 if(U_FAILURE(errorCode
)) {
215 if(isBogus() || destCapacity
<0 || (destCapacity
>0 && dest
==0)) {
216 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// Presumably the empty-string case (the guarding condition is not visible).
222 return u_terminateChars(dest
, destCapacity
, 0, &errorCode
);
// Records whether cnv came from u_getDefaultConverter() so that it can be
// released below.
226 UBool isDefaultConverter
;
228 isDefaultConverter
=TRUE
;
229 cnv
=u_getDefaultConverter(&errorCode
);
230 if(U_FAILURE(errorCode
)) {
234 isDefaultConverter
=FALSE
;
235 ucnv_resetFromUnicode(cnv
);
239 int32_t len
=doExtract(0, length(), dest
, destCapacity
, cnv
, errorCode
);
241 // release the converter
242 if(isDefaultConverter
) {
243 u_releaseDefaultConverter(cnv
);
// Converts the range starting at getArrayStart()+start, `length` units long,
// into `dest` with ucnv_fromUnicode() and returns the result of
// u_terminateChars() for the total output length.
// Visible steps:
//  - an incoming failure in errorCode is handled first (with a special case
//    for destCapacity != 0 whose body is not visible);
//  - destLimit is derived from destCapacity: the "magic" value -1 pins the
//    limit to U_MAX_PTR(dest) and turns destCapacity into 0x7fffffff;
//    otherwise the limit is dest + destCapacity;
//  - after the first conversion, `length` becomes the number of chars written;
//  - on U_BUFFER_OVERFLOW_ERROR the conversion is repeated into `buffer`,
//    adding each pass's output to `length`, until no overflow is reported.
// NOTE(review): the return type, the cnv parameter line, the declaration of
// `buffer`, the start of the do-loop, the destCapacity == 0 branch body and
// the closing braces are missing from this listing.
250 UnicodeString::doExtract(int32_t start
, int32_t length
,
251 char *dest
, int32_t destCapacity
,
253 UErrorCode
&errorCode
) const
255 if(U_FAILURE(errorCode
)) {
256 if(destCapacity
!=0) {
262 const UChar
*src
=getArrayStart()+start
, *srcLimit
=src
+length
;
// Remembered so the number of chars written can be computed after conversion.
263 char *originalDest
=dest
;
264 const char *destLimit
;
266 if(destCapacity
==0) {
268 } else if(destCapacity
==-1) {
269 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
270 destLimit
=(char*)U_MAX_PTR(dest
);
271 // for NUL-termination, translate into highest int32_t
272 destCapacity
=0x7fffffff;
274 destLimit
=dest
+destCapacity
;
277 // perform the conversion
278 ucnv_fromUnicode(cnv
, &dest
, destLimit
, &src
, srcLimit
, 0, TRUE
, &errorCode
);
279 length
=(int32_t)(dest
-originalDest
);
281 // if an overflow occurs, then get the preflighting length
282 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
// Each pass converts into `buffer`; presumably dest is pointed back at buffer
// before every pass (that line is not visible) -- confirm.
285 destLimit
=buffer
+sizeof(buffer
);
// The overflow status is cleared so the next ucnv_fromUnicode() call runs.
288 errorCode
=U_ZERO_ERROR
;
289 ucnv_fromUnicode(cnv
, &dest
, destLimit
, &src
, srcLimit
, 0, TRUE
, &errorCode
);
290 length
+=(int32_t)(dest
-buffer
);
291 } while(errorCode
==U_BUFFER_OVERFLOW_ERROR
);
294 return u_terminateChars(originalDest
, destCapacity
, length
, &errorCode
);
// Fills this string from codepageData by picking a conversion path from
// `codepage` and delegating to the converter-taking doCodepageCreate().
// Visible steps:
//  - nothing is done for a null codepageData, a zero dataLength, or a
//    dataLength below -1;
//  - dataLength == -1 is replaced by uprv_strlen(codepageData);
//  - when the default converter name is UTF-8, setToUTF8() is used; otherwise
//    the default converter comes from u_getDefaultConverter();
//  - an empty codepage string uses the "invariant characters" conversion
//    (cloneArrayIfNeeded, u_charsToUChars, setLength);
//  - any other codepage is opened with ucnv_open();
//  - after the conversion the converter is released or closed.
// NOTE(review): the return type, the dataLength parameter line, the condition
// that selects the default-codepage path, the failure-handling bodies, the
// early returns and the closing braces are missing from this listing.
298 UnicodeString::doCodepageCreate(const char *codepageData
,
300 const char *codepage
)
302 // if there's nothing to convert, do nothing
303 if(codepageData
== 0 || dataLength
== 0 || dataLength
< -1) {
306 if(dataLength
== -1) {
307 dataLength
= (int32_t)uprv_strlen(codepageData
);
310 UErrorCode status
= U_ZERO_ERROR
;
312 // create the converter
313 // if the codepage is the default, use our cache
314 // if it is an empty string, then use the "invariant character" conversion
315 UConverter
*converter
;
317 const char *defaultName
= ucnv_getDefaultName();
// When the default converter name is UTF-8, no converter object is used.
318 if(UCNV_FAST_IS_UTF8(defaultName
)) {
319 setToUTF8(StringPiece(codepageData
, dataLength
));
322 converter
= u_getDefaultConverter(&status
);
323 } else if(*codepage
== 0) {
324 // use the "invariant characters" conversion
// The copy happens only if cloneArrayIfNeeded() succeeds; what happens when
// it fails is not visible in this listing.
325 if(cloneArrayIfNeeded(dataLength
, dataLength
, FALSE
)) {
326 u_charsToUChars(codepageData
, getArrayStart(), dataLength
);
327 setLength(dataLength
);
333 converter
= ucnv_open(codepage
, &status
);
336 // if we failed, set the appropriate flags and return
337 if(U_FAILURE(status
)) {
342 // perform the conversion
343 doCodepageCreate(codepageData
, dataLength
, converter
, status
);
344 if(U_FAILURE(status
)) {
348 // close the converter
// Presumably the default converter is released and an opened one is closed;
// the condition choosing between the two calls is not visible -- confirm.
350 u_releaseDefaultConverter(converter
);
352 ucnv_close(converter
);
357 UnicodeString::doCodepageCreate(const char *codepageData
,
359 UConverter
*converter
,
362 if(U_FAILURE(status
)) {
366 // set up the conversion parameters
367 const char *mySource
= codepageData
;
368 const char *mySourceEnd
= mySource
+ dataLength
;
369 UChar
*array
, *myTarget
;
371 // estimate the size needed:
373 if(dataLength
<= US_STACKBUF_SIZE
) {
374 // try to use the stack buffer
375 arraySize
= US_STACKBUF_SIZE
;
377 // 1.25 UChar's per source byte should cover most cases
378 arraySize
= dataLength
+ (dataLength
>> 2);
381 // we do not care about the current contents
382 UBool doCopyArray
= FALSE
;
384 if(!cloneArrayIfNeeded(arraySize
, arraySize
, doCopyArray
)) {
389 // perform the conversion
390 array
= getArrayStart();
391 myTarget
= array
+ length();
392 ucnv_toUnicode(converter
, &myTarget
, array
+ getCapacity(),
393 &mySource
, mySourceEnd
, 0, TRUE
, &status
);
395 // update the conversion parameters
396 setLength((int32_t)(myTarget
- array
));
398 // allocate more space and copy data, if needed
399 if(status
== U_BUFFER_OVERFLOW_ERROR
) {
400 // reset the error code
401 status
= U_ZERO_ERROR
;
403 // keep the previous conversion results
406 // estimate the new size needed, larger than before
407 // try 2 UChar's per remaining source byte
408 arraySize
= (int32_t)(length() + 2 * (mySourceEnd
- mySource
));