[apple/icu.git] / icuSources / common / unistr_cnv.cpp

/*
*******************************************************************************
*
*   Copyright (C) 1999-2006, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  unistr_cnv.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:2
*
*   created on: 2004aug19
*   created by: Markus W. Scherer
*
*   Character conversion functions moved here from unistr.cpp
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "unicode/putil.h"
#include "cstring.h"
#include "cmemory.h"
#include "unicode/ustring.h"
#include "unicode/unistr.h"
#include "unicode/ucnv.h"
#include "putilimp.h"
#include "ustr_cnv.h"
#include "ustr_imp.h"

U_NAMESPACE_BEGIN

//========================================
// Constructors
//========================================

// Constructs a string from NUL-terminated codepage data.
// codepageData: bytes to convert; a null pointer leaves the string empty.
// codepage:     passed through unchanged to doCodepageCreate().
UnicodeString::UnicodeString(const char *codepageData,
                             const char *codepage)
  : fLength(0),
    fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer),
    fFlags(kShortString)
{
    // nothing to convert: stay an empty stack-buffer string
    if(codepageData == 0) {
        return;
    }

    // the input is NUL-terminated, so measure it before handing it on
    const int32_t byteCount = (int32_t)uprv_strlen(codepageData);
    doCodepageCreate(codepageData, byteCount, codepage);
}


// Constructs a string from codepage data with an explicit byte length.
// codepageData: bytes to convert; a null pointer leaves the string empty.
// dataLength:   forwarded as-is; doCodepageCreate() interprets it.
// codepage:     forwarded as-is to doCodepageCreate().
UnicodeString::UnicodeString(const char *codepageData,
                             int32_t dataLength,
                             const char *codepage)
  : fLength(0),
    fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer),
    fFlags(kShortString)
{
    // nothing to convert: stay an empty stack-buffer string
    if(codepageData == 0) {
        return;
    }

    doCodepageCreate(codepageData, dataLength, codepage);
}

// Constructs a string by converting src with a caller-supplied converter,
// or with the default converter when cnv is null.
// src:       input bytes; NULL is treated as an empty string.
// srcLength: byte count, or -1 if src is NUL-terminated; values below -1
//            set U_ILLEGAL_ARGUMENT_ERROR.
// cnv:       converter to use (it is reset first), or 0 for the default one.
// errorCode: in/out; if it already indicates failure nothing is done at all.
//            If a failure is recorded here, the string is set to bogus.
UnicodeString::UnicodeString(const char *src, int32_t srcLength,
                             UConverter *cnv,
                             UErrorCode &errorCode)
  : fLength(0),
    fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer),
    fFlags(kShortString)
{
    // a pre-existing error means: leave the empty string untouched
    if(U_FAILURE(errorCode)) {
        return;
    }

    if(src!=NULL) {
        if(srcLength<-1) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        } else {
            // resolve the "NUL-terminated" length marker
            const int32_t byteCount=
                (srcLength==-1) ? (int32_t)uprv_strlen(src) : srcLength;

            if(byteCount>0) {
                // remember who owns the converter before cnv is overwritten
                const UBool borrowedDefault=(UBool)(cnv==0);

                if(borrowedDefault) {
                    cnv=u_getDefaultConverter(&errorCode);
                } else {
                    ucnv_resetToUnicode(cnv);
                }

                doCodepageCreate(src, byteCount, cnv, errorCode);

                if(borrowedDefault) {
                    u_releaseDefaultConverter(cnv);
                }
            }
        }
    }

    // any failure recorded above invalidates the string
    if(U_FAILURE(errorCode)) {
        setToBogus();
    }
}

//========================================
// Codeset conversion
//========================================
// Extracts a substring into a char buffer using a named codepage.
// start, length: substring to extract; pinned to legal values.
// target:        output buffer; may be null only if dstSize is 0.
// dstSize:       capacity of target in bytes. It is unsigned; any value
//                >= 0x80000000 (typically 0xffffffff) means "unlimited".
// codepage:      0 selects the default converter, "" selects the
//                "invariant characters" conversion, anything else is
//                opened with ucnv_open().
// Returns the value produced by u_terminateChars()/doExtract(), or 0 if
// target is null while dstSize is non-zero. Converter errors are kept in a
// local status and are not reported to the caller.
int32_t
UnicodeString::extract(int32_t start,
                       int32_t length,
                       char *target,
                       uint32_t dstSize,
                       const char *codepage) const
{
    // if the arguments are illegal, then do nothing
    // (dstSize is unsigned, so there is no "negative size" to reject)
    if(dstSize > 0 && target == 0) {
        return 0;
    }

    // pin the indices to legal values
    pinIndices(start, length);

    // careful: dstSize is unsigned! (0xffffffff means "unlimited")
    // Normalize it once so that every branch below agrees on what
    // "unlimited" means; previously the converter branch only recognized
    // 0xffffffff and turned 0x80000000..0xfffffffe into a negative capacity.
    const UBool isUnlimited = (UBool)(dstSize >= 0x80000000);
    // for NUL-termination (takes int32_t), "unlimited" is the highest int32_t
    const int32_t capacity = isUnlimited ? 0x7fffffff : (int32_t)dstSize;

    UErrorCode status = U_ZERO_ERROR;

    // just write the NUL if the string length is 0
    if(length == 0) {
        return u_terminateChars(target, capacity, 0, &status);
    }

    // if it is an empty string, then use the "invariant character" conversion
    if(codepage != 0 && *codepage == 0) {
        // copy no more than fits, but report the full length
        const int32_t destLength = (length <= capacity) ? length : capacity;
        u_UCharsToChars(getArrayStart() + start, target, destLength);
        return u_terminateChars(target, capacity, length, &status);
    }

    // create the converter
    // if the codepage is the default, use our cache
    UConverter *converter;
    if(codepage == 0) {
        converter = u_getDefaultConverter(&status);
    } else {
        converter = ucnv_open(codepage, &status);
    }

    // doExtract() uses -1 as its "magic" unlimited capacity
    length = doExtract(start, length, target,
                       isUnlimited ? -1 : capacity,
                       converter, status);

    // close the converter
    if(codepage == 0) {
        u_releaseDefaultConverter(converter);
    } else {
        ucnv_close(converter);
    }

    return length;
}

// Converts the whole string into dest using cnv, or the default converter
// when cnv is null.
// dest:         output buffer; may be null only if destCapacity is 0.
// destCapacity: size of dest in bytes; must not be negative.
// cnv:          converter to use (it is reset first), or 0 for the default.
// errorCode:    in/out; on entry a failure makes this a no-op returning 0.
//               Set to U_ILLEGAL_ARGUMENT_ERROR for a bogus string or bad
//               buffer arguments.
// Returns the length reported by u_terminateChars()/doExtract(), or 0 on
// the early-exit paths.
int32_t
UnicodeString::extract(char *dest, int32_t destCapacity,
                       UConverter *cnv,
                       UErrorCode &errorCode) const
{
    if(U_FAILURE(errorCode)) {
        return 0;
    }

    // validate the object and the output buffer description
    const UBool argsAreUsable=
        (UBool)(!isBogus() && destCapacity>=0 && (destCapacity<=0 || dest!=0));
    if(!argsAreUsable) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // empty string: only the terminator needs handling
    if(fLength<=0) {
        return u_terminateChars(dest, destCapacity, 0, &errorCode);
    }

    // decide which converter to use; remember whether it must be released
    const UBool borrowedDefault=(UBool)(cnv==0);
    if(borrowedDefault) {
        cnv=u_getDefaultConverter(&errorCode);
        if(U_FAILURE(errorCode)) {
            return 0;
        }
    } else {
        ucnv_resetFromUnicode(cnv);
    }

    const int32_t resultLength=
        doExtract(0, fLength, dest, destCapacity, cnv, errorCode);

    // hand the default converter back
    if(borrowedDefault) {
        u_releaseDefaultConverter(cnv);
    }

    return resultLength;
}

// Converts length UChars starting at fArray+start into dest with cnv and
// returns the total number of bytes the conversion produces, including
// bytes that did not fit (preflighting).
// start, length: substring to convert; not validated here.
// dest:          output buffer.
// destCapacity:  size of dest; 0 means "count only", -1 is the "magic"
//                value for an unlimited buffer.
// cnv:           converter to use.
// errorCode:     in/out; on entry a failure returns 0 after writing a NUL
//                into dest when destCapacity is non-zero.
int32_t
UnicodeString::doExtract(int32_t start, int32_t length,
                         char *dest, int32_t destCapacity,
                         UConverter *cnv,
                         UErrorCode &errorCode) const
{
    if(U_FAILURE(errorCode)) {
        if(destCapacity!=0) {
            *dest=0;
        }
        return 0;
    }

    const UChar *src=fArray+start, *srcLimit=src+length;
    // the caller's buffer, kept for the final NUL-termination
    char *originalDest=dest;
    const char *destLimit;

    if(destCapacity==0) {
        // count only: give the converter no output room at all
        destLimit=dest=0;
    } else if(destCapacity==-1) {
        // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
        destLimit=(char*)U_MAX_PTR(dest);
        // for NUL-termination, translate into highest int32_t
        destCapacity=0x7fffffff;
    } else {
        destLimit=dest+destCapacity;
    }

    // Measure the output from the pointer that is actually handed to the
    // converter. With destCapacity==0 that pointer is 0, not the caller's
    // buffer; subtracting originalDest there produced a garbage length
    // whenever a non-null buffer was passed with zero capacity.
    char *const conversionStart=dest;

    // perform the conversion
    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    length=(int32_t)(dest-conversionStart);

    // if an overflow occurs, then get the preflighting length
    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
        // scratch space: the remaining output is only counted, not kept
        char buffer[1024];

        destLimit=buffer+sizeof(buffer);
        do {
            dest=buffer;
            errorCode=U_ZERO_ERROR;
            ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
            length+=(int32_t)(dest-buffer);
        } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
    }

    return u_terminateChars(originalDest, destCapacity, length, &errorCode);
}

void
UnicodeString::doCodepageCreate(const char *codepageData,
                                int32_t dataLength,
                                const char *codepage)
{
    // if there's nothing to convert, do nothing
    if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
        return;
    }
    if(dataLength == -1) {
        dataLength = (int32_t)uprv_strlen(codepageData);
    }

    UErrorCode status = U_ZERO_ERROR;

    // create the converter
    // if the codepage is the default, use our cache
    // if it is an empty string, then use the "invariant character" conversion
    UConverter *converter = (codepage == 0 ?
                             u_getDefaultConverter(&status) :
                             *codepage == 0 ?
                               0 :
                               ucnv_open(codepage, &status));

    // if we failed, set the appropriate flags and return
    if(U_FAILURE(status)) {
        setToBogus();
        return;
    }

    // perform the conversion
    if(converter == 0) {
        // use the "invariant characters" conversion
        if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
            u_charsToUChars(codepageData, getArrayStart(), dataLength);
            fLength = dataLength;
        } else {
            setToBogus();
        }
        return;
    }

    // convert using the real converter
    doCodepageCreate(codepageData, dataLength, converter, status);
    if(U_FAILURE(status)) {
        setToBogus();
    }

    // close the converter
    if(codepage == 0) {
        u_releaseDefaultConverter(converter);
    } else {
        ucnv_close(converter);
    }
}

void
UnicodeString::doCodepageCreate(const char *codepageData,
                                int32_t dataLength,
                                UConverter *converter,
                                UErrorCode &status)
{
    if(U_FAILURE(status)) {
        return;
    }

    // set up the conversion parameters
    const char *mySource     = codepageData;
    const char *mySourceEnd  = mySource + dataLength;
    UChar *myTarget;

    // estimate the size needed:
    // 1.25 UChar's per source byte should cover most cases
    int32_t arraySize = dataLength + (dataLength >> 2);

    // we do not care about the current contents
    UBool doCopyArray = FALSE;
    for(;;) {
        if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
            setToBogus();
            break;
        }

        // perform the conversion
        myTarget = fArray + fLength;
        ucnv_toUnicode(converter, &myTarget,  fArray + fCapacity,
            &mySource, mySourceEnd, 0, TRUE, &status);

        // update the conversion parameters
        fLength = (int32_t)(myTarget - fArray);

        // allocate more space and copy data, if needed
        if(status == U_BUFFER_OVERFLOW_ERROR) {
            // reset the error code
            status = U_ZERO_ERROR;

            // keep the previous conversion results
            doCopyArray = TRUE;

            // estimate the new size needed, larger than before
            // try 2 UChar's per remaining source byte
            arraySize = (int32_t)(fLength + 2 * (mySourceEnd - mySource));
        } else {
            break;
        }
    }
}

U_NAMESPACE_END

#endif
Commit	Line	Data
374ca955 A	1	/*
	2	*******************************************************************************
	3	*
73c04bcf	4	* Copyright (C) 1999-2006, International Business Machines
374ca955 A	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: unistr_cnv.cpp
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:2
	12	*
	13	* created on: 2004aug19
	14	* created by: Markus W. Scherer
	15	*
	16	* Character conversion functions moved here from unistr.cpp
	17	*/
	18
	19	#include "unicode/utypes.h"
	20
	21	#if !UCONFIG_NO_CONVERSION
	22
	23	#include "unicode/putil.h"
	24	#include "cstring.h"
	25	#include "cmemory.h"
	26	#include "unicode/ustring.h"
	27	#include "unicode/unistr.h"
	28	#include "unicode/ucnv.h"
	29	#include "putilimp.h"
	30	#include "ustr_cnv.h"
	31	#include "ustr_imp.h"
	32
	33	U_NAMESPACE_BEGIN
	34
	35	//========================================
	36	// Constructors
	37	//========================================
	38
	39	UnicodeString::UnicodeString(const char *codepageData,
	40	const char *codepage)
	41	: fLength(0),
	42	fCapacity(US_STACKBUF_SIZE),
	43	fArray(fStackBuffer),
	44	fFlags(kShortString)
	45	{
73c04bcf A	46	if(codepageData != 0) {
	47	doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
	48	}
374ca955 A	49	}
	50
	51
	52	UnicodeString::UnicodeString(const char *codepageData,
	53	int32_t dataLength,
	54	const char *codepage)
	55	: fLength(0),
	56	fCapacity(US_STACKBUF_SIZE),
	57	fArray(fStackBuffer),
	58	fFlags(kShortString)
	59	{
73c04bcf A	60	if(codepageData != 0) {
	61	doCodepageCreate(codepageData, dataLength, codepage);
	62	}
374ca955 A	63	}
	64
	65	UnicodeString::UnicodeString(const char *src, int32_t srcLength,
	66	UConverter *cnv,
	67	UErrorCode &errorCode)
	68	: fLength(0),
	69	fCapacity(US_STACKBUF_SIZE),
	70	fArray(fStackBuffer),
	71	fFlags(kShortString)
	72	{
73c04bcf A	73	if(U_SUCCESS(errorCode)) {
	74	// check arguments
	75	if(src==NULL) {
	76	// treat as an empty string, do nothing more
	77	} else if(srcLength<-1) {
	78	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
374ca955	79	} else {
73c04bcf A	80	// get input length
	81	if(srcLength==-1) {
	82	srcLength=(int32_t)uprv_strlen(src);
	83	}
	84	if(srcLength>0) {
	85	if(cnv!=0) {
	86	// use the provided converter
	87	ucnv_resetToUnicode(cnv);
	88	doCodepageCreate(src, srcLength, cnv, errorCode);
	89	} else {
	90	// use the default converter
	91	cnv=u_getDefaultConverter(&errorCode);
	92	doCodepageCreate(src, srcLength, cnv, errorCode);
	93	u_releaseDefaultConverter(cnv);
	94	}
	95	}
374ca955	96	}
374ca955	97
73c04bcf A	98	if(U_FAILURE(errorCode)) {
	99	setToBogus();
	100	}
374ca955	101	}
374ca955 A	102	}
	103
	104	//========================================
	105	// Codeset conversion
	106	//========================================
	107	int32_t
	108	UnicodeString::extract(int32_t start,
	109	int32_t length,
	110	char *target,
	111	uint32_t dstSize,
	112	const char *codepage) const
	113	{
73c04bcf A	114	// if the arguments are illegal, then do nothing
	115	if(/dstSize < 0 \|\| /(dstSize > 0 && target == 0)) {
	116	return 0;
	117	}
	118
	119	// pin the indices to legal values
	120	pinIndices(start, length);
	121
	122	// create the converter
	123	UConverter *converter;
	124	UErrorCode status = U_ZERO_ERROR;
	125
	126	// just write the NUL if the string length is 0
	127	if(length == 0) {
	128	if(dstSize >= 0x80000000) {
	129	// careful: dstSize is unsigned! (0xffffffff means "unlimited")
	130	// make sure that the NUL-termination works (takes int32_t)
	131	dstSize=0x7fffffff;
	132	}
	133	return u_terminateChars(target, dstSize, 0, &status);
	134	}
	135
	136	// if the codepage is the default, use our cache
	137	// if it is an empty string, then use the "invariant character" conversion
	138	if (codepage == 0) {
	139	converter = u_getDefaultConverter(&status);
	140	} else if (*codepage == 0) {
	141	// use the "invariant characters" conversion
	142	int32_t destLength;
	143	// careful: dstSize is unsigned! (0xffffffff means "unlimited")
	144	if(dstSize >= 0x80000000) {
	145	destLength = length;
	146	// make sure that the NUL-termination works (takes int32_t)
	147	dstSize=0x7fffffff;
	148	} else if(length <= (int32_t)dstSize) {
	149	destLength = length;
	150	} else {
	151	destLength = (int32_t)dstSize;
	152	}
	153	u_UCharsToChars(getArrayStart() + start, target, destLength);
	154	return u_terminateChars(target, (int32_t)dstSize, length, &status);
374ca955	155	} else {
73c04bcf	156	converter = ucnv_open(codepage, &status);
374ca955	157	}
73c04bcf A	158
	159	length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
	160
	161	// close the converter
	162	if (codepage == 0) {
	163	u_releaseDefaultConverter(converter);
	164	} else {
	165	ucnv_close(converter);
	166	}
	167
	168	return length;
374ca955 A	169	}
	170
	171	int32_t
	172	UnicodeString::extract(char *dest, int32_t destCapacity,
	173	UConverter *cnv,
73c04bcf A	174	UErrorCode &errorCode) const
73c04bcf A	175	{
374ca955	176	if(U_FAILURE(errorCode)) {
73c04bcf	177	return 0;
374ca955	178	}
374ca955	179
73c04bcf A	180	if(isBogus() \|\| destCapacity<0 \|\| (destCapacity>0 && dest==0)) {
	181	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	182	return 0;
	183	}
374ca955	184
73c04bcf A	185	// nothing to do?
	186	if(fLength<=0) {
	187	return u_terminateChars(dest, destCapacity, 0, &errorCode);
	188	}
374ca955	189
73c04bcf A	190	// get the converter
	191	UBool isDefaultConverter;
	192	if(cnv==0) {
	193	isDefaultConverter=TRUE;
	194	cnv=u_getDefaultConverter(&errorCode);
	195	if(U_FAILURE(errorCode)) {
	196	return 0;
	197	}
	198	} else {
	199	isDefaultConverter=FALSE;
	200	ucnv_resetFromUnicode(cnv);
	201	}
	202
	203	// convert
	204	int32_t length=doExtract(0, fLength, dest, destCapacity, cnv, errorCode);
	205
	206	// release the converter
	207	if(isDefaultConverter) {
	208	u_releaseDefaultConverter(cnv);
	209	}
	210
	211	return length;
374ca955 A	212	}
	213
	214	int32_t
	215	UnicodeString::doExtract(int32_t start, int32_t length,
	216	char *dest, int32_t destCapacity,
	217	UConverter *cnv,
73c04bcf A	218	UErrorCode &errorCode) const
	219	{
	220	if(U_FAILURE(errorCode)) {
	221	if(destCapacity!=0) {
	222	*dest=0;
	223	}
	224	return 0;
	225	}
	226
	227	const UChar src=fArray+start, srcLimit=src+length;
	228	char *originalDest=dest;
	229	const char *destLimit;
	230
	231	if(destCapacity==0) {
	232	destLimit=dest=0;
	233	} else if(destCapacity==-1) {
	234	// Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
	235	destLimit=(char*)U_MAX_PTR(dest);
	236	// for NUL-termination, translate into highest int32_t
	237	destCapacity=0x7fffffff;
	238	} else {
	239	destLimit=dest+destCapacity;
	240	}
	241
	242	// perform the conversion
	243	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
	244	length=(int32_t)(dest-originalDest);
	245
	246	// if an overflow occurs, then get the preflighting length
	247	if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
	248	char buffer[1024];
	249
	250	destLimit=buffer+sizeof(buffer);
	251	do {
	252	dest=buffer;
	253	errorCode=U_ZERO_ERROR;
	254	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
	255	length+=(int32_t)(dest-buffer);
	256	} while(errorCode==U_BUFFER_OVERFLOW_ERROR);
374ca955	257	}
73c04bcf A	258
73c04bcf A	259	return u_terminateChars(originalDest, destCapacity, length, &errorCode);
374ca955 A	260	}
	261
	262	void
	263	UnicodeString::doCodepageCreate(const char *codepageData,
73c04bcf A	264	int32_t dataLength,
73c04bcf A	265	const char *codepage)
374ca955	266	{
73c04bcf A	267	// if there's nothing to convert, do nothing
	268	if(codepageData == 0 \|\| dataLength == 0 \|\| dataLength < -1) {
	269	return;
	270	}
	271	if(dataLength == -1) {
	272	dataLength = (int32_t)uprv_strlen(codepageData);
	273	}
	274
	275	UErrorCode status = U_ZERO_ERROR;
	276
	277	// create the converter
	278	// if the codepage is the default, use our cache
	279	// if it is an empty string, then use the "invariant character" conversion
	280	UConverter *converter = (codepage == 0 ?
374ca955 A	281	u_getDefaultConverter(&status) :
	282	*codepage == 0 ?
	283	0 :
	284	ucnv_open(codepage, &status));
	285
73c04bcf A	286	// if we failed, set the appropriate flags and return
	287	if(U_FAILURE(status)) {
	288	setToBogus();
	289	return;
	290	}
	291
	292	// perform the conversion
	293	if(converter == 0) {
	294	// use the "invariant characters" conversion
	295	if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
	296	u_charsToUChars(codepageData, getArrayStart(), dataLength);
	297	fLength = dataLength;
	298	} else {
	299	setToBogus();
	300	}
	301	return;
	302	}
	303
	304	// convert using the real converter
	305	doCodepageCreate(codepageData, dataLength, converter, status);
	306	if(U_FAILURE(status)) {
	307	setToBogus();
	308	}
	309
	310	// close the converter
	311	if(codepage == 0) {
	312	u_releaseDefaultConverter(converter);
374ca955	313	} else {
73c04bcf	314	ucnv_close(converter);
374ca955	315	}
374ca955 A	316	}
	317
	318	void
	319	UnicodeString::doCodepageCreate(const char *codepageData,
	320	int32_t dataLength,
	321	UConverter *converter,
73c04bcf A	322	UErrorCode &status)
	323	{
	324	if(U_FAILURE(status)) {
	325	return;
374ca955 A	326	}
374ca955 A	327
73c04bcf A	328	// set up the conversion parameters
	329	const char *mySource = codepageData;
	330	const char *mySourceEnd = mySource + dataLength;
	331	UChar *myTarget;
	332
	333	// estimate the size needed:
	334	// 1.25 UChar's per source byte should cover most cases
	335	int32_t arraySize = dataLength + (dataLength >> 2);
	336
	337	// we do not care about the current contents
	338	UBool doCopyArray = FALSE;
	339	for(;;) {
	340	if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
	341	setToBogus();
	342	break;
	343	}
374ca955	344
73c04bcf A	345	// perform the conversion
	346	myTarget = fArray + fLength;
	347	ucnv_toUnicode(converter, &myTarget, fArray + fCapacity,
	348	&mySource, mySourceEnd, 0, TRUE, &status);
374ca955	349
73c04bcf A	350	// update the conversion parameters
73c04bcf A	351	fLength = (int32_t)(myTarget - fArray);
374ca955	352
73c04bcf A	353	// allocate more space and copy data, if needed
	354	if(status == U_BUFFER_OVERFLOW_ERROR) {
	355	// reset the error code
	356	status = U_ZERO_ERROR;
374ca955	357
73c04bcf A	358	// keep the previous conversion results
	359	doCopyArray = TRUE;
	360
	361	// estimate the new size needed, larger than before
	362	// try 2 UChar's per remaining source byte
	363	arraySize = (int32_t)(fLength + 2 * (mySourceEnd - mySource));
	364	} else {
	365	break;
	366	}
374ca955	367	}
374ca955 A	368	}
	369
	370	U_NAMESPACE_END
	371
	372	#endif