]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/unistr_cnv.cpp
2 *******************************************************************************
4 * Copyright (C) 1999-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: unistr_cnv.cpp
10 * tab size: 8 (not used)
13 * created on: 2004aug19
14 * created by: Markus W. Scherer
16 * Character conversion functions moved here from unistr.cpp
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_CONVERSION
23 #include "unicode/putil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/ucnv.h"
36 //========================================
38 //========================================
40 #if !U_CHARSET_IS_UTF8
42 UnicodeString::UnicodeString(const char *codepageData
)
46 if(codepageData
!= 0) {
47 doCodepageCreate(codepageData
, (int32_t)uprv_strlen(codepageData
), 0);
51 UnicodeString::UnicodeString(const char *codepageData
,
56 if(codepageData
!= 0) {
57 doCodepageCreate(codepageData
, dataLength
, 0);
61 // else see unistr.cpp
64 UnicodeString::UnicodeString(const char *codepageData
,
69 if(codepageData
!= 0) {
70 doCodepageCreate(codepageData
, (int32_t)uprv_strlen(codepageData
), codepage
);
74 UnicodeString::UnicodeString(const char *codepageData
,
80 if(codepageData
!= 0) {
81 doCodepageCreate(codepageData
, dataLength
, codepage
);
85 UnicodeString::UnicodeString(const char *src
, int32_t srcLength
,
87 UErrorCode
&errorCode
)
91 if(U_SUCCESS(errorCode
)) {
94 // treat as an empty string, do nothing more
95 } else if(srcLength
<-1) {
96 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
100 srcLength
=(int32_t)uprv_strlen(src
);
104 // use the provided converter
105 ucnv_resetToUnicode(cnv
);
106 doCodepageCreate(src
, srcLength
, cnv
, errorCode
);
108 // use the default converter
109 cnv
=u_getDefaultConverter(&errorCode
);
110 doCodepageCreate(src
, srcLength
, cnv
, errorCode
);
111 u_releaseDefaultConverter(cnv
);
116 if(U_FAILURE(errorCode
)) {
122 //========================================
123 // Codeset conversion
124 //========================================
126 #if !U_CHARSET_IS_UTF8
129 UnicodeString::extract(int32_t start
,
132 uint32_t dstSize
) const {
133 return extract(start
, length
, target
, dstSize
, 0);
136 // else see unistr.cpp
140 UnicodeString::extract(int32_t start
,
144 const char *codepage
) const
146 // if the arguments are illegal, then do nothing
147 if(/*dstSize < 0 || */(dstSize
> 0 && target
== 0)) {
151 // pin the indices to legal values
152 pinIndices(start
, length
);
154 // We need to cast dstSize to int32_t for all subsequent code.
155 // I don't know why the API was defined with uint32_t but we are stuck with it.
156 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
157 // as a limit in some functions, it may wrap around and yield a pointer
158 // that compares less-than target.
160 if(dstSize
< 0x7fffffff) {
161 // Assume that the capacity is real and a limit pointer won't wrap around.
162 capacity
= (int32_t)dstSize
;
164 // Pin the capacity so that a limit pointer does not wrap around.
165 char *targetLimit
= (char *)U_MAX_PTR(target
);
166 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
167 // greater than target and does not wrap around the top of the address space.
168 capacity
= (int32_t)(targetLimit
- target
);
171 // create the converter
172 UConverter
*converter
;
173 UErrorCode status
= U_ZERO_ERROR
;
175 // just write the NUL if the string length is 0
177 return u_terminateChars(target
, capacity
, 0, &status
);
180 // if the codepage is the default, use our cache
181 // if it is an empty string, then use the "invariant character" conversion
183 const char *defaultName
= ucnv_getDefaultName();
184 if(UCNV_FAST_IS_UTF8(defaultName
)) {
185 return toUTF8(start
, length
, target
, capacity
);
187 converter
= u_getDefaultConverter(&status
);
188 } else if (*codepage
== 0) {
189 // use the "invariant characters" conversion
191 if(length
<= capacity
) {
194 destLength
= capacity
;
196 u_UCharsToChars(getArrayStart() + start
, target
, destLength
);
197 return u_terminateChars(target
, capacity
, length
, &status
);
199 converter
= ucnv_open(codepage
, &status
);
202 length
= doExtract(start
, length
, target
, capacity
, converter
, status
);
204 // close the converter
206 u_releaseDefaultConverter(converter
);
208 ucnv_close(converter
);
215 UnicodeString::extract(char *dest
, int32_t destCapacity
,
217 UErrorCode
&errorCode
) const
219 if(U_FAILURE(errorCode
)) {
223 if(isBogus() || destCapacity
<0 || (destCapacity
>0 && dest
==0)) {
224 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
230 return u_terminateChars(dest
, destCapacity
, 0, &errorCode
);
234 UBool isDefaultConverter
;
236 isDefaultConverter
=TRUE
;
237 cnv
=u_getDefaultConverter(&errorCode
);
238 if(U_FAILURE(errorCode
)) {
242 isDefaultConverter
=FALSE
;
243 ucnv_resetFromUnicode(cnv
);
247 int32_t len
=doExtract(0, length(), dest
, destCapacity
, cnv
, errorCode
);
249 // release the converter
250 if(isDefaultConverter
) {
251 u_releaseDefaultConverter(cnv
);
258 UnicodeString::doExtract(int32_t start
, int32_t length
,
259 char *dest
, int32_t destCapacity
,
261 UErrorCode
&errorCode
) const
263 if(U_FAILURE(errorCode
)) {
264 if(destCapacity
!=0) {
270 const UChar
*src
=getArrayStart()+start
, *srcLimit
=src
+length
;
271 char *originalDest
=dest
;
272 const char *destLimit
;
274 if(destCapacity
==0) {
276 } else if(destCapacity
==-1) {
277 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
278 destLimit
=(char*)U_MAX_PTR(dest
);
279 // for NUL-termination, translate into highest int32_t
280 destCapacity
=0x7fffffff;
282 destLimit
=dest
+destCapacity
;
285 // perform the conversion
286 ucnv_fromUnicode(cnv
, &dest
, destLimit
, &src
, srcLimit
, 0, TRUE
, &errorCode
);
287 length
=(int32_t)(dest
-originalDest
);
289 // if an overflow occurs, then get the preflighting length
290 if(errorCode
==U_BUFFER_OVERFLOW_ERROR
) {
293 destLimit
=buffer
+sizeof(buffer
);
296 errorCode
=U_ZERO_ERROR
;
297 ucnv_fromUnicode(cnv
, &dest
, destLimit
, &src
, srcLimit
, 0, TRUE
, &errorCode
);
298 length
+=(int32_t)(dest
-buffer
);
299 } while(errorCode
==U_BUFFER_OVERFLOW_ERROR
);
302 return u_terminateChars(originalDest
, destCapacity
, length
, &errorCode
);
306 UnicodeString::doCodepageCreate(const char *codepageData
,
308 const char *codepage
)
310 // if there's nothing to convert, do nothing
311 if(codepageData
== 0 || dataLength
== 0 || dataLength
< -1) {
314 if(dataLength
== -1) {
315 dataLength
= (int32_t)uprv_strlen(codepageData
);
318 UErrorCode status
= U_ZERO_ERROR
;
320 // create the converter
321 // if the codepage is the default, use our cache
322 // if it is an empty string, then use the "invariant character" conversion
323 UConverter
*converter
;
325 const char *defaultName
= ucnv_getDefaultName();
326 if(UCNV_FAST_IS_UTF8(defaultName
)) {
327 setToUTF8(StringPiece(codepageData
, dataLength
));
330 converter
= u_getDefaultConverter(&status
);
331 } else if(*codepage
== 0) {
332 // use the "invariant characters" conversion
333 if(cloneArrayIfNeeded(dataLength
, dataLength
, FALSE
)) {
334 u_charsToUChars(codepageData
, getArrayStart(), dataLength
);
335 setLength(dataLength
);
341 converter
= ucnv_open(codepage
, &status
);
344 // if we failed, set the appropriate flags and return
345 if(U_FAILURE(status
)) {
350 // perform the conversion
351 doCodepageCreate(codepageData
, dataLength
, converter
, status
);
352 if(U_FAILURE(status
)) {
356 // close the converter
358 u_releaseDefaultConverter(converter
);
360 ucnv_close(converter
);
365 UnicodeString::doCodepageCreate(const char *codepageData
,
367 UConverter
*converter
,
370 if(U_FAILURE(status
)) {
374 // set up the conversion parameters
375 const char *mySource
= codepageData
;
376 const char *mySourceEnd
= mySource
+ dataLength
;
377 UChar
*array
, *myTarget
;
379 // estimate the size needed:
381 if(dataLength
<= US_STACKBUF_SIZE
) {
382 // try to use the stack buffer
383 arraySize
= US_STACKBUF_SIZE
;
385 // 1.25 UChar's per source byte should cover most cases
386 arraySize
= dataLength
+ (dataLength
>> 2);
389 // we do not care about the current contents
390 UBool doCopyArray
= FALSE
;
392 if(!cloneArrayIfNeeded(arraySize
, arraySize
, doCopyArray
)) {
397 // perform the conversion
398 array
= getArrayStart();
399 myTarget
= array
+ length();
400 ucnv_toUnicode(converter
, &myTarget
, array
+ getCapacity(),
401 &mySource
, mySourceEnd
, 0, TRUE
, &status
);
403 // update the conversion parameters
404 setLength((int32_t)(myTarget
- array
));
406 // allocate more space and copy data, if needed
407 if(status
== U_BUFFER_OVERFLOW_ERROR
) {
408 // reset the error code
409 status
= U_ZERO_ERROR
;
411 // keep the previous conversion results
414 // estimate the new size needed, larger than before
415 // try 2 UChar's per remaining source byte
416 arraySize
= (int32_t)(length() + 2 * (mySourceEnd
- mySource
));