git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2005-2016, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: ucasemap.cpp
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2005may06
	14	* created by: Markus W. Scherer
	15	*
	16	* Case mapping service object and functions using it.
	17	*/
	18
	19	#include "unicode/utypes.h"
	20	#include "unicode/brkiter.h"
	21	#include "unicode/ubrk.h"
	22	#include "unicode/uloc.h"
	23	#include "unicode/ustring.h"
	24	#include "unicode/ucasemap.h"
	25	#if !UCONFIG_NO_BREAK_ITERATION
	26	#include "unicode/utext.h"
	27	#endif
	28	#include "unicode/utf.h"
	29	#include "unicode/utf8.h"
	30	#include "unicode/utf16.h"
	31	#include "cmemory.h"
	32	#include "cstring.h"
	33	#include "ucase.h"
	34	#include "ustr_imp.h"
	35
	36	U_NAMESPACE_USE
	37
	38	/* UCaseMap service object -------------------------------------------------- */
	39
	40	U_CAPI UCaseMap * U_EXPORT2
	41	ucasemap_open(const char locale, uint32_t options, UErrorCode pErrorCode) {
	42	UCaseMap *csm;
	43
	44	if(U_FAILURE(*pErrorCode)) {
	45	return NULL;
	46	}
	47
	48	csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap));
	49	if(csm==NULL) {
	50	return NULL;
	51	}
	52	uprv_memset(csm, 0, sizeof(UCaseMap));
	53
	54	csm->csp=ucase_getSingleton();
	55	ucasemap_setLocale(csm, locale, pErrorCode);
	56	if(U_FAILURE(*pErrorCode)) {
	57	uprv_free(csm);
	58	return NULL;
	59	}
	60
	61	csm->options=options;
	62	return csm;
	63	}
	64
	65	U_CAPI void U_EXPORT2
	66	ucasemap_close(UCaseMap *csm) {
	67	if(csm!=NULL) {
	68	#if !UCONFIG_NO_BREAK_ITERATION
	69	// Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
	70	delete reinterpret_cast<BreakIterator *>(csm->iter);
	71	#endif
	72	uprv_free(csm);
	73	}
	74	}
	75
	76	U_CAPI const char * U_EXPORT2
	77	ucasemap_getLocale(const UCaseMap *csm) {
	78	return csm->locale;
	79	}
	80
	81	U_CAPI uint32_t U_EXPORT2
	82	ucasemap_getOptions(const UCaseMap *csm) {
	83	return csm->options;
	84	}
	85
	86	U_CAPI void U_EXPORT2
	87	ucasemap_setLocale(UCaseMap csm, const char locale, UErrorCode *pErrorCode) {
	88	int32_t length;
	89
	90	if(U_FAILURE(*pErrorCode)) {
	91	return;
	92	}
	93
	94	length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
	95	if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR \|\| length==sizeof(csm->locale)) {
	96	*pErrorCode=U_ZERO_ERROR;
	97	/* we only really need the language code for case mappings */
	98	length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
	99	}
	100	if(length==sizeof(csm->locale)) {
	101	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
	102	}
	103	csm->locCache=0;
	104	if(U_SUCCESS(*pErrorCode)) {
	105	ucase_getCaseLocale(csm->locale, &csm->locCache);
	106	} else {
	107	csm->locale[0]=0;
	108	}
	109	}
	110
	111	U_CAPI void U_EXPORT2
	112	ucasemap_setOptions(UCaseMap csm, uint32_t options, UErrorCode /pErrorCode/) {
	113	csm->options=options;
	114	}
	115
	116	/* UTF-8 string case mappings ----------------------------------------------- */
	117
	118	/* TODO(markus): Move to a new, separate utf8case.c file. */
	119
	120	/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
	121	static inline int32_t
	122	appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
	123	int32_t result, const UChar *s) {
	124	UChar32 c;
	125	int32_t length;
	126	UErrorCode errorCode;
	127
	128	/* decode the result */
	129	if(result<0) {
	130	/* (not) original code point */
	131	c=~result;
	132	length=U8_LENGTH(c);
	133	} else if(result<=UCASE_MAX_STRING_LENGTH) {
	134	c=U_SENTINEL;
	135	length=result;
	136	} else {
	137	c=result;
	138	length=U8_LENGTH(c);
	139	}
	140	if(length>(INT32_MAX-destIndex)) {
	141	return -1; // integer overflow
	142	}
	143
	144	if(destIndex<destCapacity) {
	145	/* append the result */
	146	if(c>=0) {
	147	/* code point */
	148	UBool isError=FALSE;
	149	U8_APPEND(dest, destIndex, destCapacity, c, isError);
	150	if(isError) {
	151	/* overflow, nothing written */
	152	destIndex+=length;
	153	}
	154	} else {
	155	/* string */
	156	int32_t destLength;
	157	errorCode=U_ZERO_ERROR;
	158	u_strToUTF8(
	159	(char *)(dest+destIndex), destCapacity-destIndex, &destLength,
	160	s, length,
	161	&errorCode);
	162	if(U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
	163	return -1;
	164	}
	165	if(destLength>(INT32_MAX-destIndex)) {
	166	return -1; // integer overflow
	167	}
	168	destIndex+=destLength;
	169	/* we might have an overflow, but we know the actual length */
	170	}
	171	} else {
	172	/* preflight */
	173	if(c>=0) {
	174	destIndex+=length;
	175	} else {
	176	int32_t destLength;
	177	errorCode=U_ZERO_ERROR;
	178	u_strToUTF8(
	179	NULL, 0, &destLength,
	180	s, length,
	181	&errorCode);
	182	if(U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
	183	return -1;
	184	}
	185	if(destLength>(INT32_MAX-destIndex)) {
	186	return -1; // integer overflow
	187	}
	188	destIndex+=destLength;
	189	}
	190	}
	191	return destIndex;
	192	}
	193
	194	static inline int32_t
	195	appendUChar(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
	196	int32_t length=U8_LENGTH(c);
	197	if(length>(INT32_MAX-destIndex)) {
	198	return -1; // integer overflow
	199	}
	200	int32_t limit=destIndex+length;
	201	if(limit<destCapacity) {
	202	U8_APPEND_UNSAFE(dest, destIndex, c);
	203	}
	204	return limit;
	205	}
	206
	207	static UChar32 U_CALLCONV
	208	utf8_caseContextIterator(void *context, int8_t dir) {
	209	UCaseContext csc=(UCaseContext )context;
	210	UChar32 c;
	211
	212	if(dir<0) {
	213	/* reset for backward iteration */
	214	csc->index=csc->cpStart;
	215	csc->dir=dir;
	216	} else if(dir>0) {
	217	/* reset for forward iteration */
	218	csc->index=csc->cpLimit;
	219	csc->dir=dir;
	220	} else {
	221	/* continue current iteration direction */
	222	dir=csc->dir;
	223	}
	224
	225	if(dir<0) {
	226	if(csc->start<csc->index) {
	227	U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
	228	return c;
	229	}
	230	} else {
	231	if(csc->index<csc->limit) {
	232	U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
	233	return c;
	234	}
	235	}
	236	return U_SENTINEL;
	237	}
	238
	239	/*
	240	* Case-maps [srcStart..srcLimit[ but takes
	241	* context [0..srcLength[ into account.
	242	*/
	243	static int32_t
	244	_caseMap(const UCaseMap csm, UCaseMapFull map,
	245	uint8_t *dest, int32_t destCapacity,
	246	const uint8_t src, UCaseContext csc,
	247	int32_t srcStart, int32_t srcLimit,
	248	UErrorCode *pErrorCode) {
	249	const UChar *s = NULL;
	250	UChar32 c, c2 = 0;
	251	int32_t srcIndex, destIndex;
	252	int32_t locCache;
	253
	254	locCache=csm->locCache;
	255
	256	/* case mapping loop */
	257	srcIndex=srcStart;
	258	destIndex=0;
	259	while(srcIndex<srcLimit) {
	260	csc->cpStart=srcIndex;
	261	U8_NEXT(src, srcIndex, srcLimit, c);
	262	csc->cpLimit=srcIndex;
	263	if(c<0) {
	264	int32_t i=csc->cpStart;
	265	while(destIndex<destCapacity && i<srcIndex) {
	266	dest[destIndex++]=src[i++];
	267	}
	268	continue;
	269	}
	270	c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
	271	if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
	272	/* fast path version of appendResult() for ASCII results */
	273	dest[destIndex++]=(uint8_t)c2;
	274	} else {
	275	destIndex=appendResult(dest, destIndex, destCapacity, c, s);
	276	if(destIndex<0) {
	277	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	278	return 0;
	279	}
	280	}
	281	}
	282
	283	if(destIndex>destCapacity) {
	284	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
	285	}
	286	return destIndex;
	287	}
	288
	289	#if !UCONFIG_NO_BREAK_ITERATION
	290
	291	U_CFUNC int32_t U_CALLCONV
	292	ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
	293	uint8_t *dest, int32_t destCapacity,
	294	const uint8_t *src, int32_t srcLength,
	295	UErrorCode *pErrorCode) {
	296	const UChar *s;
	297	UChar32 c;
	298	int32_t prev, titleStart, titleLimit, idx, destIndex, length;
	299	UBool isFirstIndex;
	300
	301	if(U_FAILURE(*pErrorCode)) {
	302	return 0;
	303	}
	304
	305	// Use the C++ abstract base class to minimize dependencies.
	306	// TODO: Change UCaseMap.iter to store a BreakIterator directly.
	307	BreakIterator bi=reinterpret_cast<BreakIterator >(csm->iter);
	308
	309	/* set up local variables */
	310	int32_t locCache=csm->locCache;
	311	UCaseContext csc=UCASECONTEXT_INITIALIZER;
	312	csc.p=(void *)src;
	313	csc.limit=srcLength;
	314	destIndex=0;
	315	prev=0;
	316	isFirstIndex=TRUE;
	317
	318	/* titlecasing loop */
	319	while(prev<srcLength) {
	320	/* find next index where to titlecase */
	321	if(isFirstIndex) {
	322	isFirstIndex=FALSE;
	323	idx=bi->first();
	324	} else {
	325	idx=bi->next();
	326	}
	327	if(idx==UBRK_DONE \|\| idx>srcLength) {
	328	idx=srcLength;
	329	}
	330
	331	/*
	332	* Unicode 4 & 5 section 3.13 Default Case Operations:
	333	*
	334	* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
	335	* #29, "Text Boundaries." Between each pair of word boundaries, find the first
	336	* cased character F. If F exists, map F to default_title(F); then map each
	337	* subsequent character C to default_lower(C).
	338	*
	339	* In this implementation, segment [prev..index[ into 3 parts:
	340	* a) uncased characters (copy as-is) [prev..titleStart[
	341	* b) first case letter (titlecase) [titleStart..titleLimit[
	342	* c) subsequent characters (lowercase) [titleLimit..index[
	343	*/
	344	if(prev<idx) {
	345	/* find and copy uncased characters [prev..titleStart[ */
	346	titleStart=titleLimit=prev;
	347	U8_NEXT(src, titleLimit, idx, c);
	348	if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
	349	/* Adjust the titlecasing index (titleStart) to the next cased character. */
	350	for(;;) {
	351	titleStart=titleLimit;
	352	if(titleLimit==idx) {
	353	/*
	354	* only uncased characters in [prev..index[
	355	* stop with titleStart==titleLimit==index
	356	*/
	357	break;
	358	}
	359	U8_NEXT(src, titleLimit, idx, c);
	360	if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
	361	break; /* cased letter at [titleStart..titleLimit[ */
	362	}
	363	}
	364	length=titleStart-prev;
	365	if(length>0) {
	366	if((destIndex+length)<=destCapacity) {
	367	uprv_memcpy(dest+destIndex, src+prev, length);
	368	}
	369	destIndex+=length;
	370	}
	371	}
	372
	373	if(titleStart<titleLimit) {
	374	/* titlecase c which is from [titleStart..titleLimit[ */
	375	csc.cpStart=titleStart;
	376	csc.cpLimit=titleLimit;
	377	c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache);
	378	destIndex=appendResult(dest, destIndex, destCapacity, c, s);
	379	if(destIndex<0) {
	380	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	381	return 0;
	382	}
	383
	384	/* Special case Dutch IJ titlecasing */
	385	if (titleStart+1 < idx &&
	386	ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH &&
	387	(src[titleStart] == 0x0049 \|\| src[titleStart] == 0x0069) &&
	388	(src[titleStart+1] == 0x004A \|\| src[titleStart+1] == 0x006A)) {
	389	destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
	390	titleLimit++;
	391	}
	392	/* lowercase [titleLimit..index[ */
	393	if(titleLimit<idx) {
	394	if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
	395	/* Normal operation: Lowercase the rest of the word. */
	396	destIndex+=
	397	_caseMap(
	398	csm, ucase_toFullLower,
	399	dest+destIndex, destCapacity-destIndex,
	400	src, &csc,
	401	titleLimit, idx,
	402	pErrorCode);
	403	if(U_FAILURE(*pErrorCode)) {
	404	return destIndex;
	405	}
	406	} else {
	407	/* Optionally just copy the rest of the word unchanged. */
	408	length=idx-titleLimit;
	409	if(length>(INT32_MAX-destIndex)) {
	410	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	411	return 0;
	412	}
	413	if((destIndex+length)<=destCapacity) {
	414	uprv_memcpy(dest+destIndex, src+titleLimit, length);
	415	}
	416	destIndex+=length;
	417	}
	418	}
	419	}
	420	}
	421
	422	prev=idx;
	423	}
	424
	425	if(destIndex>destCapacity) {
	426	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
	427	}
	428	return destIndex;
	429	}
	430
	431	#endif
	432
	433	static int32_t U_CALLCONV
	434	ucasemap_internalUTF8ToLower(const UCaseMap *csm,
	435	uint8_t *dest, int32_t destCapacity,
	436	const uint8_t *src, int32_t srcLength,
	437	UErrorCode *pErrorCode) {
	438	UCaseContext csc=UCASECONTEXT_INITIALIZER;
	439	csc.p=(void *)src;
	440	csc.limit=srcLength;
	441	return _caseMap(
	442	csm, ucase_toFullLower,
	443	dest, destCapacity,
	444	src, &csc, 0, srcLength,
	445	pErrorCode);
	446	}
	447
	448	static int32_t U_CALLCONV
	449	ucasemap_internalUTF8ToUpper(const UCaseMap *csm,
	450	uint8_t *dest, int32_t destCapacity,
	451	const uint8_t *src, int32_t srcLength,
	452	UErrorCode *pErrorCode) {
	453	UCaseContext csc=UCASECONTEXT_INITIALIZER;
	454	csc.p=(void *)src;
	455	csc.limit=srcLength;
	456	return _caseMap(
	457	csm, ucase_toFullUpper,
	458	dest, destCapacity,
	459	src, &csc, 0, srcLength,
	460	pErrorCode);
	461	}
	462
	463	static int32_t
	464	utf8_foldCase(const UCaseProps *csp,
	465	uint8_t *dest, int32_t destCapacity,
	466	const uint8_t *src, int32_t srcLength,
	467	uint32_t options,
	468	UErrorCode *pErrorCode) {
	469	int32_t srcIndex, destIndex;
	470
	471	const UChar *s;
	472	UChar32 c, c2;
	473	int32_t start;
	474
	475	/* case mapping loop */
	476	srcIndex=destIndex=0;
	477	while(srcIndex<srcLength) {
	478	start=srcIndex;
	479	U8_NEXT(src, srcIndex, srcLength, c);
	480	if(c<0) {
	481	while(destIndex<destCapacity && start<srcIndex) {
	482	dest[destIndex++]=src[start++];
	483	}
	484	continue;
	485	}
	486	c=ucase_toFullFolding(csp, c, &s, options);
	487	if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
	488	/* fast path version of appendResult() for ASCII results */
	489	dest[destIndex++]=(uint8_t)c2;
	490	} else {
	491	destIndex=appendResult(dest, destIndex, destCapacity, c, s);
	492	if(destIndex<0) {
	493	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	494	return 0;
	495	}
	496	}
	497	}
	498
	499	if(destIndex>destCapacity) {
	500	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
	501	}
	502	return destIndex;
	503	}
	504
	505	static int32_t U_CALLCONV
	506	ucasemap_internalUTF8Fold(const UCaseMap *csm,
	507	uint8_t *dest, int32_t destCapacity,
	508	const uint8_t *src, int32_t srcLength,
	509	UErrorCode *pErrorCode) {
	510	return utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
	511	}
	512
	513	U_CFUNC int32_t
	514	ucasemap_mapUTF8(const UCaseMap *csm,
	515	uint8_t *dest, int32_t destCapacity,
	516	const uint8_t *src, int32_t srcLength,
	517	UTF8CaseMapper *stringCaseMapper,
	518	UErrorCode *pErrorCode) {
	519	int32_t destLength;
	520
	521	/* check argument values */
	522	if(U_FAILURE(*pErrorCode)) {
	523	return 0;
	524	}
	525	if( destCapacity<0 \|\|
	526	(dest==NULL && destCapacity>0) \|\|
	527	src==NULL \|\|
	528	srcLength<-1
	529	) {
	530	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	531	return 0;
	532	}
	533
	534	/* get the string length */
	535	if(srcLength==-1) {
	536	srcLength=(int32_t)uprv_strlen((const char *)src);
	537	}
	538
	539	/* check for overlapping source and destination */
	540	if( dest!=NULL &&
	541	((src>=dest && src<(dest+destCapacity)) \|\|
	542	(dest>=src && dest<(src+srcLength)))
	543	) {
	544	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	545	return 0;
	546	}
	547
	548	destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, pErrorCode);
	549	return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode);
	550	}
	551
	552	/* public API functions */
	553
	554	U_CAPI int32_t U_EXPORT2
	555	ucasemap_utf8ToLower(const UCaseMap *csm,
	556	char *dest, int32_t destCapacity,
	557	const char *src, int32_t srcLength,
	558	UErrorCode *pErrorCode) {
	559	return ucasemap_mapUTF8(csm,
	560	(uint8_t *)dest, destCapacity,
	561	(const uint8_t *)src, srcLength,
	562	ucasemap_internalUTF8ToLower, pErrorCode);
	563	}
	564
	565	U_CAPI int32_t U_EXPORT2
	566	ucasemap_utf8ToUpper(const UCaseMap *csm,
	567	char *dest, int32_t destCapacity,
	568	const char *src, int32_t srcLength,
	569	UErrorCode *pErrorCode) {
	570	return ucasemap_mapUTF8(csm,
	571	(uint8_t *)dest, destCapacity,
	572	(const uint8_t *)src, srcLength,
	573	ucasemap_internalUTF8ToUpper, pErrorCode);
	574	}
	575
	576	U_CAPI int32_t U_EXPORT2
	577	ucasemap_utf8FoldCase(const UCaseMap *csm,
	578	char *dest, int32_t destCapacity,
	579	const char *src, int32_t srcLength,
	580	UErrorCode *pErrorCode) {
	581	return ucasemap_mapUTF8(csm,
	582	(uint8_t *)dest, destCapacity,
	583	(const uint8_t *)src, srcLength,
	584	ucasemap_internalUTF8Fold, pErrorCode);
	585	}