git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/common/unistr

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	*******************************************************************************
	5	*
	6	* Copyright (C) 1999-2014, International Business Machines
	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
	10	* file name: unistr_cnv.cpp
	11	* encoding: UTF-8
	12	* tab size: 8 (not used)
	13	* indentation:2
	14	*
	15	* created on: 2004aug19
	16	* created by: Markus W. Scherer
	17	*
	18	* Character conversion functions moved here from unistr.cpp
	19	*/
	20
	21	#include "unicode/utypes.h"
	22
	23	#if !UCONFIG_NO_CONVERSION
	24
	25	#include "unicode/putil.h"
	26	#include "cstring.h"
	27	#include "cmemory.h"
	28	#include "unicode/ustring.h"
	29	#include "unicode/unistr.h"
	30	#include "unicode/ucnv.h"
	31	#include "ucnv_imp.h"
	32	#include "putilimp.h"
	33	#include "ustr_cnv.h"
	34	#include "ustr_imp.h"
	35
	36	U_NAMESPACE_BEGIN
	37
	38	//========================================
	39	// Constructors
	40	//========================================
	41
	42	#if !U_CHARSET_IS_UTF8
	43
	44	UnicodeString::UnicodeString(const char *codepageData) {
	45	fUnion.fFields.fLengthAndFlags = kShortString;
	46	if(codepageData != 0) {
	47	doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
	48	}
	49	}
	50
	51	UnicodeString::UnicodeString(const char *codepageData,
	52	int32_t dataLength) {
	53	fUnion.fFields.fLengthAndFlags = kShortString;
	54	if(codepageData != 0) {
	55	doCodepageCreate(codepageData, dataLength, 0);
	56	}
	57	}
	58
	59	// else see unistr.cpp
	60	#endif
	61
	62	UnicodeString::UnicodeString(const char *codepageData,
	63	const char *codepage) {
	64	fUnion.fFields.fLengthAndFlags = kShortString;
	65	if(codepageData != 0) {
	66	doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
	67	}
	68	}
	69
	70	UnicodeString::UnicodeString(const char *codepageData,
	71	int32_t dataLength,
	72	const char *codepage) {
	73	fUnion.fFields.fLengthAndFlags = kShortString;
	74	if(codepageData != 0) {
	75	doCodepageCreate(codepageData, dataLength, codepage);
	76	}
	77	}
	78
	79	UnicodeString::UnicodeString(const char *src, int32_t srcLength,
	80	UConverter *cnv,
	81	UErrorCode &errorCode) {
	82	fUnion.fFields.fLengthAndFlags = kShortString;
	83	if(U_SUCCESS(errorCode)) {
	84	// check arguments
	85	if(src==NULL) {
	86	// treat as an empty string, do nothing more
	87	} else if(srcLength<-1) {
	88	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	89	} else {
	90	// get input length
	91	if(srcLength==-1) {
	92	srcLength=(int32_t)uprv_strlen(src);
	93	}
	94	if(srcLength>0) {
	95	if(cnv!=0) {
	96	// use the provided converter
	97	ucnv_resetToUnicode(cnv);
	98	doCodepageCreate(src, srcLength, cnv, errorCode);
	99	} else {
	100	// use the default converter
	101	cnv=u_getDefaultConverter(&errorCode);
	102	doCodepageCreate(src, srcLength, cnv, errorCode);
	103	u_releaseDefaultConverter(cnv);
	104	}
	105	}
	106	}
	107
	108	if(U_FAILURE(errorCode)) {
	109	setToBogus();
	110	}
	111	}
	112	}
	113
	114	//========================================
	115	// Codeset conversion
	116	//========================================
	117
	118	#if !U_CHARSET_IS_UTF8
	119
	120	int32_t
	121	UnicodeString::extract(int32_t start,
	122	int32_t length,
	123	char *target,
	124	uint32_t dstSize) const {
	125	return extract(start, length, target, dstSize, 0);
	126	}
	127
	128	// else see unistr.cpp
	129	#endif
	130
	131	int32_t
	132	UnicodeString::extract(int32_t start,
	133	int32_t length,
	134	char *target,
	135	uint32_t dstSize,
	136	const char *codepage) const
	137	{
	138	// if the arguments are illegal, then do nothing
	139	if(/dstSize < 0 \|\| /(dstSize > 0 && target == 0)) {
	140	return 0;
	141	}
	142
	143	// pin the indices to legal values
	144	pinIndices(start, length);
	145
	146	// We need to cast dstSize to int32_t for all subsequent code.
	147	// I don't know why the API was defined with uint32_t but we are stuck with it.
	148	// Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
	149	// as a limit in some functions, it may wrap around and yield a pointer
	150	// that compares less-than target.
	151	int32_t capacity;
	152	if(dstSize < 0x7fffffff) {
	153	// Assume that the capacity is real and a limit pointer won't wrap around.
	154	capacity = (int32_t)dstSize;
	155	} else {
	156	// Pin the capacity so that a limit pointer does not wrap around.
	157	char targetLimit = (char )U_MAX_PTR(target);
	158	// U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
	159	// greater than target and does not wrap around the top of the address space.
	160	capacity = (int32_t)(targetLimit - target);
	161	}
	162
	163	// create the converter
	164	UConverter *converter;
	165	UErrorCode status = U_ZERO_ERROR;
	166
	167	// just write the NUL if the string length is 0
	168	if(length == 0) {
	169	return u_terminateChars(target, capacity, 0, &status);
	170	}
	171
	172	// if the codepage is the default, use our cache
	173	// if it is an empty string, then use the "invariant character" conversion
	174	if (codepage == 0) {
	175	const char *defaultName = ucnv_getDefaultName();
	176	if(UCNV_FAST_IS_UTF8(defaultName)) {
	177	return toUTF8(start, length, target, capacity);
	178	}
	179	converter = u_getDefaultConverter(&status);
	180	} else if (*codepage == 0) {
	181	// use the "invariant characters" conversion
	182	int32_t destLength;
	183	if(length <= capacity) {
	184	destLength = length;
	185	} else {
	186	destLength = capacity;
	187	}
	188	u_UCharsToChars(getArrayStart() + start, target, destLength);
	189	return u_terminateChars(target, capacity, length, &status);
	190	} else {
	191	converter = ucnv_open(codepage, &status);
	192	}
	193
	194	length = doExtract(start, length, target, capacity, converter, status);
	195
	196	// close the converter
	197	if (codepage == 0) {
	198	u_releaseDefaultConverter(converter);
	199	} else {
	200	ucnv_close(converter);
	201	}
	202
	203	return length;
	204	}
	205
	206	int32_t
	207	UnicodeString::extract(char *dest, int32_t destCapacity,
	208	UConverter *cnv,
	209	UErrorCode &errorCode) const
	210	{
	211	if(U_FAILURE(errorCode)) {
	212	return 0;
	213	}
	214
	215	if(isBogus() \|\| destCapacity<0 \|\| (destCapacity>0 && dest==0)) {
	216	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
	217	return 0;
	218	}
	219
	220	// nothing to do?
	221	if(isEmpty()) {
	222	return u_terminateChars(dest, destCapacity, 0, &errorCode);
	223	}
	224
	225	// get the converter
	226	UBool isDefaultConverter;
	227	if(cnv==0) {
	228	isDefaultConverter=TRUE;
	229	cnv=u_getDefaultConverter(&errorCode);
	230	if(U_FAILURE(errorCode)) {
	231	return 0;
	232	}
	233	} else {
	234	isDefaultConverter=FALSE;
	235	ucnv_resetFromUnicode(cnv);
	236	}
	237
	238	// convert
	239	int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
	240
	241	// release the converter
	242	if(isDefaultConverter) {
	243	u_releaseDefaultConverter(cnv);
	244	}
	245
	246	return len;
	247	}
	248
	249	int32_t
	250	UnicodeString::doExtract(int32_t start, int32_t length,
	251	char *dest, int32_t destCapacity,
	252	UConverter *cnv,
	253	UErrorCode &errorCode) const
	254	{
	255	if(U_FAILURE(errorCode)) {
	256	if(destCapacity!=0) {
	257	*dest=0;
	258	}
	259	return 0;
	260	}
	261
	262	const UChar src=getArrayStart()+start, srcLimit=src+length;
	263	char *originalDest=dest;
	264	const char *destLimit;
	265
	266	if(destCapacity==0) {
	267	destLimit=dest=0;
	268	} else if(destCapacity==-1) {
	269	// Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
	270	destLimit=(char*)U_MAX_PTR(dest);
	271	// for NUL-termination, translate into highest int32_t
	272	destCapacity=0x7fffffff;
	273	} else {
	274	destLimit=dest+destCapacity;
	275	}
	276
	277	// perform the conversion
	278	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
	279	length=(int32_t)(dest-originalDest);
	280
	281	// if an overflow occurs, then get the preflighting length
	282	if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
	283	char buffer[1024];
	284
	285	destLimit=buffer+sizeof(buffer);
	286	do {
	287	dest=buffer;
	288	errorCode=U_ZERO_ERROR;
	289	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
	290	length+=(int32_t)(dest-buffer);
	291	} while(errorCode==U_BUFFER_OVERFLOW_ERROR);
	292	}
	293
	294	return u_terminateChars(originalDest, destCapacity, length, &errorCode);
	295	}
	296
	297	void
	298	UnicodeString::doCodepageCreate(const char *codepageData,
	299	int32_t dataLength,
	300	const char *codepage)
	301	{
	302	// if there's nothing to convert, do nothing
	303	if(codepageData == 0 \|\| dataLength == 0 \|\| dataLength < -1) {
	304	return;
	305	}
	306	if(dataLength == -1) {
	307	dataLength = (int32_t)uprv_strlen(codepageData);
	308	}
	309
	310	UErrorCode status = U_ZERO_ERROR;
	311
	312	// create the converter
	313	// if the codepage is the default, use our cache
	314	// if it is an empty string, then use the "invariant character" conversion
	315	UConverter *converter;
	316	if (codepage == 0) {
	317	const char *defaultName = ucnv_getDefaultName();
	318	if(UCNV_FAST_IS_UTF8(defaultName)) {
	319	setToUTF8(StringPiece(codepageData, dataLength));
	320	return;
	321	}
	322	converter = u_getDefaultConverter(&status);
	323	} else if(*codepage == 0) {
	324	// use the "invariant characters" conversion
	325	if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
	326	u_charsToUChars(codepageData, getArrayStart(), dataLength);
	327	setLength(dataLength);
	328	} else {
	329	setToBogus();
	330	}
	331	return;
	332	} else {
	333	converter = ucnv_open(codepage, &status);
	334	}
	335
	336	// if we failed, set the appropriate flags and return
	337	if(U_FAILURE(status)) {
	338	setToBogus();
	339	return;
	340	}
	341
	342	// perform the conversion
	343	doCodepageCreate(codepageData, dataLength, converter, status);
	344	if(U_FAILURE(status)) {
	345	setToBogus();
	346	}
	347
	348	// close the converter
	349	if(codepage == 0) {
	350	u_releaseDefaultConverter(converter);
	351	} else {
	352	ucnv_close(converter);
	353	}
	354	}
	355
	356	void
	357	UnicodeString::doCodepageCreate(const char *codepageData,
	358	int32_t dataLength,
	359	UConverter *converter,
	360	UErrorCode &status)
	361	{
	362	if(U_FAILURE(status)) {
	363	return;
	364	}
	365
	366	// set up the conversion parameters
	367	const char *mySource = codepageData;
	368	const char *mySourceEnd = mySource + dataLength;
	369	UChar array, myTarget;
	370
	371	// estimate the size needed:
	372	int32_t arraySize;
	373	if(dataLength <= US_STACKBUF_SIZE) {
	374	// try to use the stack buffer
	375	arraySize = US_STACKBUF_SIZE;
	376	} else {
	377	// 1.25 UChar's per source byte should cover most cases
	378	arraySize = dataLength + (dataLength >> 2);
	379	}
	380
	381	// we do not care about the current contents
	382	UBool doCopyArray = FALSE;
	383	for(;;) {
	384	if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
	385	setToBogus();
	386	break;
	387	}
	388
	389	// perform the conversion
	390	array = getArrayStart();
	391	myTarget = array + length();
	392	ucnv_toUnicode(converter, &myTarget, array + getCapacity(),
	393	&mySource, mySourceEnd, 0, TRUE, &status);
	394
	395	// update the conversion parameters
	396	setLength((int32_t)(myTarget - array));
	397
	398	// allocate more space and copy data, if needed
	399	if(status == U_BUFFER_OVERFLOW_ERROR) {
	400	// reset the error code
	401	status = U_ZERO_ERROR;
	402
	403	// keep the previous conversion results
	404	doCopyArray = TRUE;
	405
	406	// estimate the new size needed, larger than before
	407	// try 2 UChar's per remaining source byte
	408	arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
	409	} else {
	410	break;
	411	}
	412	}
	413	}
	414
	415	U_NAMESPACE_END
	416
	417	#endif