]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/unistr_cnv.cpp
/*
*******************************************************************************
*
*   Copyright (C) 1999-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  unistr_cnv.cpp
*   tab size:   8 (not used)
*
*   created on: 2004aug19
*   created by: Markus W. Scherer
*
*   Character conversion functions moved here from unistr.cpp
*/
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_CONVERSION
23 #include "unicode/putil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/ucnv.h"
//========================================
// Constructors
//========================================
40 #if !U_CHARSET_IS_UTF8
42 UnicodeString::UnicodeString(const char *codepageData
) {
43 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
44 if(codepageData
!= 0) {
45 doCodepageCreate(codepageData
, (int32_t)uprv_strlen(codepageData
), 0);
49 UnicodeString::UnicodeString(const char *codepageData
,
51 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
52 if(codepageData
!= 0) {
53 doCodepageCreate(codepageData
, dataLength
, 0);
57 // else see unistr.cpp
60 UnicodeString::UnicodeString(const char *codepageData
,
61 const char *codepage
) {
62 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
63 if(codepageData
!= 0) {
64 doCodepageCreate(codepageData
, (int32_t)uprv_strlen(codepageData
), codepage
);
68 UnicodeString::UnicodeString(const char *codepageData
,
70 const char *codepage
) {
71 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
72 if(codepageData
!= 0) {
73 doCodepageCreate(codepageData
, dataLength
, codepage
);
77 UnicodeString::UnicodeString(const char *src
, int32_t srcLength
,
79 UErrorCode
&errorCode
) {
80 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
81 if(U_SUCCESS(errorCode
)) {
84 // treat as an empty string, do nothing more
85 } else if(srcLength
<-1) {
86 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
90 srcLength
=(int32_t)uprv_strlen(src
);
94 // use the provided converter
95 ucnv_resetToUnicode(cnv
);
96 doCodepageCreate(src
, srcLength
, cnv
, errorCode
);
98 // use the default converter
99 cnv
=u_getDefaultConverter(&errorCode
);
100 doCodepageCreate(src
, srcLength
, cnv
, errorCode
);
101 u_releaseDefaultConverter(cnv
);
106 if(U_FAILURE(errorCode
)) {
//========================================
// Codeset conversion
//========================================
116 #if !U_CHARSET_IS_UTF8
119 UnicodeString::extract(int32_t start
,
122 uint32_t dstSize
) const {
123 return extract(start
, length
, target
, dstSize
, 0);
126 // else see unistr.cpp
130 UnicodeString::extract(int32_t start
,
134 const char *codepage
) const
136 // if the arguments are illegal, then do nothing
137 if(/*dstSize < 0 || */(dstSize
> 0 && target
== 0)) {
141 // pin the indices to legal values
142 pinIndices(start
, length
);
144 // We need to cast dstSize to int32_t for all subsequent code.
145 // I don't know why the API was defined with uint32_t but we are stuck with it.
146 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
147 // as a limit in some functions, it may wrap around and yield a pointer
148 // that compares less-than target.
150 if(dstSize
< 0x7fffffff) {
151 // Assume that the capacity is real and a limit pointer won't wrap around.
152 capacity
= (int32_t)dstSize
;
154 // Pin the capacity so that a limit pointer does not wrap around.
155 char *targetLimit
= (char *)U_MAX_PTR(target
);
156 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
157 // greater than target and does not wrap around the top of the address space.
158 capacity
= (int32_t)(targetLimit
- target
);
161 // create the converter
162 UConverter
*converter
;
163 UErrorCode status
= U_ZERO_ERROR
;
165 // just write the NUL if the string length is 0
167 return u_terminateChars(target
, capacity
, 0, &status
);
170 // if the codepage is the default, use our cache
171 // if it is an empty string, then use the "invariant character" conversion
173 const char *defaultName
= ucnv_getDefaultName();
174 if(UCNV_FAST_IS_UTF8(defaultName
)) {
175 return toUTF8(start
, length
, target
, capacity
);
177 converter
= u_getDefaultConverter(&status
);
178 } else if (*codepage
== 0) {
179 // use the "invariant characters" conversion
181 if(length
<= capacity
) {
184 destLength
= capacity
;
186 u_UCharsToChars(getArrayStart() + start
, target
, destLength
);
187 return u_terminateChars(target
, capacity
, length
, &status
);
189 converter
= ucnv_open(codepage
, &status
);
192 length
= doExtract(start
, length
, target
, capacity
, converter
, status
);
194 // close the converter
196 u_releaseDefaultConverter(converter
);
198 ucnv_close(converter
);
205 UnicodeString::extract(char *dest
, int32_t destCapacity
,
207 UErrorCode
&errorCode
) const
209 if(U_FAILURE(errorCode
)) {
213 if(isBogus() || destCapacity
<0 || (destCapacity
>0 && dest
==0)) {
214 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
220 return u_terminateChars(dest
, destCapacity
, 0, &errorCode
);
224 UBool isDefaultConverter
;
226 isDefaultConverter
=TRUE
;
227 cnv
=u_getDefaultConverter(&errorCode
);
228 if(U_FAILURE(errorCode
)) {
232 isDefaultConverter
=FALSE
;
233 ucnv_resetFromUnicode(cnv
);
237 int32_t len
=doExtract(0, length(), dest
, destCapacity
, cnv
, errorCode
);
239 // release the converter
240 if(isDefaultConverter
) {
241 u_releaseDefaultConverter(cnv
);
248 UnicodeString::doExtract(int32_t start
, int32_t length
,
249 char *dest
, int32_t destCapacity
,
251 UErrorCode
&errorCode
) const
253 if(U_FAILURE(errorCode
)) {
254 if(destCapacity
!=0) {
260 const UChar
*src
=getArrayStart()+start
, *srcLimit
=src
+length
;
261 char *originalDest
=dest
;
262 const char *destLimit
;
264 if(destCapacity
==0) {
266 } else if(destCapacity
==-1) {
267 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
268 destLimit
=(char*)U_MAX_PTR(dest
);
269 // for NUL-termination, translate into highest int32_t
270 destCapacity
=0x7fffffff;
272 destLimit
=dest
+destCapacity
;
275 // perform the conversion
276 ucnv_fromUnicode(cnv
, &dest
, destLimit
, &src
, srcLimit
, 0, TRUE
, &errorCode
);
277 length
=(int32_t)(dest
-originalDest
);
279 // if an overflow occurs, then get the preflighting length
280 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
283 destLimit
=buffer
+sizeof(buffer
);
286 errorCode
=U_ZERO_ERROR
;
287 ucnv_fromUnicode(cnv
, &dest
, destLimit
, &src
, srcLimit
, 0, TRUE
, &errorCode
);
288 length
+=(int32_t)(dest
-buffer
);
289 } while(errorCode
==U_BUFFER_OVERFLOW_ERROR
);
292 return u_terminateChars(originalDest
, destCapacity
, length
, &errorCode
);
296 UnicodeString::doCodepageCreate(const char *codepageData
,
298 const char *codepage
)
300 // if there's nothing to convert, do nothing
301 if(codepageData
== 0 || dataLength
== 0 || dataLength
< -1) {
304 if(dataLength
== -1) {
305 dataLength
= (int32_t)uprv_strlen(codepageData
);
308 UErrorCode status
= U_ZERO_ERROR
;
310 // create the converter
311 // if the codepage is the default, use our cache
312 // if it is an empty string, then use the "invariant character" conversion
313 UConverter
*converter
;
315 const char *defaultName
= ucnv_getDefaultName();
316 if(UCNV_FAST_IS_UTF8(defaultName
)) {
317 setToUTF8(StringPiece(codepageData
, dataLength
));
320 converter
= u_getDefaultConverter(&status
);
321 } else if(*codepage
== 0) {
322 // use the "invariant characters" conversion
323 if(cloneArrayIfNeeded(dataLength
, dataLength
, FALSE
)) {
324 u_charsToUChars(codepageData
, getArrayStart(), dataLength
);
325 setLength(dataLength
);
331 converter
= ucnv_open(codepage
, &status
);
334 // if we failed, set the appropriate flags and return
335 if(U_FAILURE(status
)) {
340 // perform the conversion
341 doCodepageCreate(codepageData
, dataLength
, converter
, status
);
342 if(U_FAILURE(status
)) {
346 // close the converter
348 u_releaseDefaultConverter(converter
);
350 ucnv_close(converter
);
355 UnicodeString::doCodepageCreate(const char *codepageData
,
357 UConverter
*converter
,
360 if(U_FAILURE(status
)) {
364 // set up the conversion parameters
365 const char *mySource
= codepageData
;
366 const char *mySourceEnd
= mySource
+ dataLength
;
367 UChar
*array
, *myTarget
;
369 // estimate the size needed:
371 if(dataLength
<= US_STACKBUF_SIZE
) {
372 // try to use the stack buffer
373 arraySize
= US_STACKBUF_SIZE
;
375 // 1.25 UChar's per source byte should cover most cases
376 arraySize
= dataLength
+ (dataLength
>> 2);
379 // we do not care about the current contents
380 UBool doCopyArray
= FALSE
;
382 if(!cloneArrayIfNeeded(arraySize
, arraySize
, doCopyArray
)) {
387 // perform the conversion
388 array
= getArrayStart();
389 myTarget
= array
+ length();
390 ucnv_toUnicode(converter
, &myTarget
, array
+ getCapacity(),
391 &mySource
, mySourceEnd
, 0, TRUE
, &status
);
393 // update the conversion parameters
394 setLength((int32_t)(myTarget
- array
));
396 // allocate more space and copy data, if needed
397 if(status
== U_BUFFER_OVERFLOW_ERROR
) {
398 // reset the error code
399 status
= U_ZERO_ERROR
;
401 // keep the previous conversion results
404 // estimate the new size needed, larger than before
405 // try 2 UChar's per remaining source byte
406 arraySize
= (int32_t)(length() + 2 * (mySourceEnd
- mySource
));