git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/samples/uciter8/uit

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2003, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: uit_len8.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2003feb10
	14	* created by: Markus W. Scherer
	15	*
	16	* This file contains the implementation of the "lenient UTF-8" UCharIterator
	17	* as used in the uciter8 sample code.
	18	* UTF-8-style macros are defined as well as the UCharIterator.
	19	* The macros are incomplete (do not assemble code points from pairs of
	20	* surrogates, see comment below)
	21	* but sufficient for the iterator.
	22	*/
	23
	24	#include <string.h>
	25	#include "unicode/utypes.h"
	26	#include "unicode/uiter.h"
	27
	28	/* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
	29
	30	/*
	31	* This code leniently reads 8-bit Unicode strings,
	32	* which could contain a mix of UTF-8 and CESU-8.
	33	* More precisely:
	34	* - supplementary code points may be encoded with dedicated 4-byte sequences
	35	* (UTF-8 style)
	36	* - supplementary code points may be encoded with
	37	* pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
	38	* (CESU-8 style)
	39	* - single surrogates are allowed, encoded with their "natural" 3-byte sequences
	40	*
	41	* Limitation:
	42	* Right now, the macros do not attempt to assemble code points from pairs of
	43	* separately encoded surrogates.
	44	* This would not be sufficient for processing based on these macros,
	45	* but it is sufficient for a UCharIterator that returns only UChars anyway.
	46	*
	47	* The code is copied and modified from utf_impl.c and utf8.h.
	48	* The "strict" argument in the implementation functions is completely removed,
	49	* using the "<0" branch from the original code.
	50	* Checks for surrogate code points are removed for the leniency
	51	* described above.
	52	*/
	53
	54	static const UChar32
	55	lenient8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
	56
	57	static UChar32
	58	lenient8_nextCharSafeBody(const uint8_t s, int32_t pi, int32_t length, UChar32 c) {
	59	int32_t i=*pi;
	60	uint8_t count=U8_COUNT_TRAIL_BYTES(c);
	61	if((i)+count<=(length)) {
	62	uint8_t trail, illegal=0;
	63
	64	U8_MASK_LEAD_BYTE((c), count);
	65	/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
	66	switch(count) {
	67	/* each branch falls through to the next one */
	68	case 5:
	69	case 4:
	70	/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
	71	illegal=1;
	72	break;
	73	case 3:
	74	trail=s[(i)++];
	75	(c)=((c)<<6)\|(trail&0x3f);
	76	if(c<0x110) {
	77	illegal\|=(trail&0xc0)^0x80;
	78	} else {
	79	/* code point>0x10ffff, outside Unicode */
	80	illegal=1;
	81	break;
	82	}
	83	case 2:
	84	trail=s[(i)++];
	85	(c)=((c)<<6)\|(trail&0x3f);
	86	illegal\|=(trail&0xc0)^0x80;
	87	case 1:
	88	trail=s[(i)++];
	89	(c)=((c)<<6)\|(trail&0x3f);
	90	illegal\|=(trail&0xc0)^0x80;
	91	break;
	92	case 0:
	93	return U_SENTINEL;
	94	/* no default branch to optimize switch() - all values are covered */
	95	}
	96
	97	/* correct sequence - all trail bytes have (b7..b6)==(10)? */
	98	/* illegal is also set if count>=4 */
	99	if(illegal \|\| (c)<lenient8_minLegal[count]) {
	100	/* error handling */
	101	uint8_t errorCount=count;
	102	/* don't go beyond this sequence */
	103	i=*pi;
	104	while(count>0 && U8_IS_TRAIL(s[i])) {
	105	++(i);
	106	--count;
	107	}
	108	c=U_SENTINEL;
	109	}
	110	} else /* too few bytes left */ {
	111	/* error handling */
	112	int32_t i0=i;
	113	/* don't just set (i)=(length) in case there is an illegal sequence */
	114	while((i)<(length) && U8_IS_TRAIL(s[i])) {
	115	++(i);
	116	}
	117	c=U_SENTINEL;
	118	}
	119	*pi=i;
	120	return c;
	121	}
	122
	123	static UChar32
	124	lenient8_prevCharSafeBody(const uint8_t s, int32_t start, int32_t pi, UChar32 c) {
	125	int32_t i=*pi;
	126	uint8_t b, count=1, shift=6;
	127
	128	/* extract value bits from the last trail byte */
	129	c&=0x3f;
	130
	131	for(;;) {
	132	if(i<=start) {
	133	/* no lead byte at all */
	134	return U_SENTINEL;
	135	}
	136
	137	/* read another previous byte */
	138	b=s[--i];
	139	if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
	140	if(b&0x40) {
	141	/* lead byte, this will always end the loop */
	142	uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
	143
	144	if(count==shouldCount) {
	145	/* set the new position */
	146	*pi=i;
	147	U8_MASK_LEAD_BYTE(b, count);
	148	c\|=(UChar32)b<<shift;
	149	if(count>=4 \|\| c>0x10ffff \|\| c<lenient8_minLegal[count]) {
	150	/* illegal sequence */
	151	if(count>=4) {
	152	count=3;
	153	}
	154	c=U_SENTINEL;
	155	} else {
	156	/* exit with correct c */
	157	}
	158	} else {
	159	/* the lead byte does not match the number of trail bytes */
	160	/* only set the position to the lead byte if it would
	161	include the trail byte that we started with */
	162	if(count<shouldCount) {
	163	*pi=i;
	164	}
	165	c=U_SENTINEL;
	166	}
	167	break;
	168	} else if(count<5) {
	169	/* trail byte */
	170	c\|=(UChar32)(b&0x3f)<<shift;
	171	++count;
	172	shift+=6;
	173	} else {
	174	/* more than 5 trail bytes is illegal */
	175	c=U_SENTINEL;
	176	break;
	177	}
	178	} else {
	179	/* single-byte character precedes trailing bytes */
	180	c=U_SENTINEL;
	181	break;
	182	}
	183	}
	184	return c;
	185	}
	186
	187	#define L8_NEXT(s, i, length, c) { \
	188	(c)=(s)[(i)++]; \
	189	if((c)>=0x80) { \
	190	if(U8_IS_LEAD(c)) { \
	191	(c)=lenient8_nextCharSafeBody(s, &(i), (int32_t)(length), c); \
	192	} else { \
	193	(c)=U_SENTINEL; \
	194	} \
	195	} \
	196	}
	197
	198	#define L8_PREV(s, start, i, c) { \
	199	(c)=(s)[--(i)]; \
	200	if((c)>=0x80) { \
	201	if((c)<=0xbf) { \
	202	(c)=lenient8_prevCharSafeBody(s, start, &(i), c); \
	203	} else { \
	204	(c)=U_SENTINEL; \
	205	} \
	206	} \
	207	}
	208
	209	/* lenient-8 UCharIterator -------------------------------------------------- */
	210
	211	/*
	212	* This is a copy of the UTF-8 UCharIterator in uiter.cpp,
	213	* except that it uses the lenient-8-bit-Unicode macros above.
	214	*/
	215
	216	/*
	217	* Minimal implementation:
	218	* Maintain a single-UChar buffer for an additional surrogate.
	219	* The caller must not modify start and limit because they are used internally.
	220	*
	221	* Use UCharIterator fields as follows:
	222	* context pointer to UTF-8 string
	223	* length UTF-16 length of the string; -1 until lazy evaluation
	224	* start current UTF-8 index
	225	* index current UTF-16 index; may be -1="unknown" after setState()
	226	* limit UTF-8 length of the string
	227	* reservedField supplementary code point
	228	*
	229	* Since UCharIterator delivers 16-bit code units, the iteration can be
	230	* currently in the middle of the byte sequence for a supplementary code point.
	231	* In this case, reservedField will contain that code point and start will
	232	* point to after the corresponding byte sequence. The UTF-16 index will be
	233	* one less than what it would otherwise be corresponding to the UTF-8 index.
	234	* Otherwise, reservedField will be 0.
	235	*/
	236
	237	/*
	238	* Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
	239	* Add implementations that do not call strlen() for iteration but check for NUL.
	240	*/
	241
	242	static int32_t U_CALLCONV
	243	lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
	244	switch(origin) {
	245	case UITER_ZERO:
	246	case UITER_START:
	247	return 0;
	248	case UITER_CURRENT:
	249	if(iter->index<0) {
	250	/* the current UTF-16 index is unknown after setState(), count from the beginning */
	251	const uint8_t *s;
	252	UChar32 c;
	253	int32_t i, limit, index;
	254
	255	s=(const uint8_t *)iter->context;
	256	i=index=0;
	257	limit=iter->start; /* count up to the UTF-8 index */
	258	while(i<limit) {
	259	L8_NEXT(s, i, limit, c);
	260	if(c<=0xffff) {
	261	++index;
	262	} else {
	263	index+=2;
	264	}
	265	}
	266
	267	iter->start=i; /* just in case setState() did not get us to a code point boundary */
	268	if(i==iter->limit) {
	269	iter->length=index; /* in case it was <0 or wrong */
	270	}
	271	if(iter->reservedField!=0) {
	272	--index; /* we are in the middle of a supplementary code point */
	273	}
	274	iter->index=index;
	275	}
	276	return iter->index;
	277	case UITER_LIMIT:
	278	case UITER_LENGTH:
	279	if(iter->length<0) {
	280	const uint8_t *s;
	281	UChar32 c;
	282	int32_t i, limit, length;
	283
	284	s=(const uint8_t *)iter->context;
	285	if(iter->index<0) {
	286	/*
	287	* the current UTF-16 index is unknown after setState(),
	288	* we must first count from the beginning to here
	289	*/
	290	i=length=0;
	291	limit=iter->start;
	292
	293	/* count from the beginning to the current index */
	294	while(i<limit) {
	295	L8_NEXT(s, i, limit, c);
	296	if(c<=0xffff) {
	297	++length;
	298	} else {
	299	length+=2;
	300	}
	301	}
	302
	303	/* assume i==limit==iter->start, set the UTF-16 index */
	304	iter->start=i; /* just in case setState() did not get us to a code point boundary */
	305	iter->index= iter->reservedField!=0 ? length-1 : length;
	306	} else {
	307	i=iter->start;
	308	length=iter->index;
	309	if(iter->reservedField!=0) {
	310	++length;
	311	}
	312	}
	313
	314	/* count from the current index to the end */
	315	limit=iter->limit;
	316	while(i<limit) {
	317	L8_NEXT(s, i, limit, c);
	318	if(c<=0xffff) {
	319	++length;
	320	} else {
	321	length+=2;
	322	}
	323	}
	324	iter->length=length;
	325	}
	326	return iter->length;
	327	default:
	328	/* not a valid origin */
	329	/* Should never get here! */
	330	return -1;
	331	}
	332	}
	333
	334	static int32_t U_CALLCONV
	335	lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
	336	const uint8_t *s;
	337	UChar32 c;
	338	int32_t pos; /* requested UTF-16 index */
	339	int32_t i; /* UTF-8 index */
	340	UBool havePos;
	341
	342	/* calculate the requested UTF-16 index */
	343	switch(origin) {
	344	case UITER_ZERO:
	345	case UITER_START:
	346	pos=delta;
	347	havePos=TRUE;
	348	/* iter->index<0 (unknown) is possible */
	349	break;
	350	case UITER_CURRENT:
	351	if(iter->index>=0) {
	352	pos=iter->index+delta;
	353	havePos=TRUE;
	354	} else {
	355	/* the current UTF-16 index is unknown after setState(), use only delta */
	356	pos=0;
	357	havePos=FALSE;
	358	}
	359	break;
	360	case UITER_LIMIT:
	361	case UITER_LENGTH:
	362	if(iter->length>=0) {
	363	pos=iter->length+delta;
	364	havePos=TRUE;
	365	} else {
	366	/* pin to the end, avoid counting the length */
	367	iter->index=-1;
	368	iter->start=iter->limit;
	369	iter->reservedField=0;
	370	if(delta>=0) {
	371	return UITER_UNKNOWN_INDEX;
	372	} else {
	373	/* the current UTF-16 index is unknown, use only delta */
	374	pos=0;
	375	havePos=FALSE;
	376	}
	377	}
	378	break;
	379	default:
	380	return -1; /* Error */
	381	}
	382
	383	if(havePos) {
	384	/* shortcuts: pinning to the edges of the string */
	385	if(pos<=0) {
	386	iter->index=iter->start=iter->reservedField=0;
	387	return 0;
	388	} else if(iter->length>=0 && pos>=iter->length) {
	389	iter->index=iter->length;
	390	iter->start=iter->limit;
	391	iter->reservedField=0;
	392	return iter->index;
	393	}
	394
	395	/* minimize the number of L8_NEXT/PREV operations */
	396	if(iter->index<0 \|\| pos<iter->index/2) {
	397	/* go forward from the start instead of backward from the current index */
	398	iter->index=iter->start=iter->reservedField=0;
	399	} else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
	400	/*
	401	* if we have the UTF-16 index and length and the new position is
	402	* closer to the end than the current index,
	403	* then go backward from the end instead of forward from the current index
	404	*/
	405	iter->index=iter->length;
	406	iter->start=iter->limit;
	407	iter->reservedField=0;
	408	}
	409
	410	delta=pos-iter->index;
	411	if(delta==0) {
	412	return iter->index; /* nothing to do */
	413	}
	414	} else {
	415	/* move relative to unknown UTF-16 index */
	416	if(delta==0) {
	417	return UITER_UNKNOWN_INDEX; /* nothing to do */
	418	} else if(-delta>=iter->start) {
	419	/* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
	420	iter->index=iter->start=iter->reservedField=0;
	421	return 0;
	422	} else if(delta>=(iter->limit-iter->start)) {
	423	/* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
	424	iter->index=iter->length; /* may or may not be <0 (unknown) */
	425	iter->start=iter->limit;
	426	iter->reservedField=0;
	427	return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
	428	}
	429	}
	430
	431	/* delta!=0 */
	432
	433	/* move towards the requested position, pin to the edges of the string */
	434	s=(const uint8_t *)iter->context;
	435	pos=iter->index; /* could be <0 (unknown) */
	436	i=iter->start;
	437	if(delta>0) {
	438	/* go forward */
	439	int32_t limit=iter->limit;
	440	if(iter->reservedField!=0) {
	441	iter->reservedField=0;
	442	++pos;
	443	--delta;
	444	}
	445	while(delta>0 && i<limit) {
	446	L8_NEXT(s, i, limit, c);
	447	if(c<0xffff) {
	448	++pos;
	449	--delta;
	450	} else if(delta>=2) {
	451	pos+=2;
	452	delta-=2;
	453	} else /* delta==1 */ {
	454	/* stop in the middle of a supplementary code point */
	455	iter->reservedField=c;
	456	++pos;
	457	break; /* delta=0; */
	458	}
	459	}
	460	if(i==limit) {
	461	if(iter->length<0 && iter->index>=0) {
	462	iter->length= iter->reservedField==0 ? pos : pos+1;
	463	} else if(iter->index<0 && iter->length>=0) {
	464	iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
	465	}
	466	}
	467	} else /* delta<0 */ {
	468	/* go backward */
	469	if(iter->reservedField!=0) {
	470	iter->reservedField=0;
	471	i-=4; /* we stayed behind the supplementary code point; go before it now */
	472	--pos;
	473	++delta;
	474	}
	475	while(delta<0 && i>0) {
	476	L8_PREV(s, 0, i, c);
	477	if(c<0xffff) {
	478	--pos;
	479	++delta;
	480	} else if(delta<=-2) {
	481	pos-=2;
	482	delta+=2;
	483	} else /* delta==-1 */ {
	484	/* stop in the middle of a supplementary code point */
	485	i+=4; /* back to behind this supplementary code point for consistent state */
	486	iter->reservedField=c;
	487	--pos;
	488	break; /* delta=0; */
	489	}
	490	}
	491	}
	492
	493	iter->start=i;
	494	if(iter->index>=0) {
	495	return iter->index=pos;
	496	} else {
	497	/* we started with index<0 (unknown) so pos is bogus */
	498	if(i<=1) {
	499	return iter->index=i; /* reached the beginning */
	500	} else {
	501	/* we still don't know the UTF-16 index */
	502	return UITER_UNKNOWN_INDEX;
	503	}
	504	}
	505	}
	506
	507	static UBool U_CALLCONV
	508	lenient8IteratorHasNext(UCharIterator *iter) {
	509	return iter->reservedField!=0 \|\| iter->start<iter->limit;
	510	}
	511
	512	static UBool U_CALLCONV
	513	lenient8IteratorHasPrevious(UCharIterator *iter) {
	514	return iter->start>0;
	515	}
	516
	517	static UChar32 U_CALLCONV
	518	lenient8IteratorCurrent(UCharIterator *iter) {
	519	if(iter->reservedField!=0) {
	520	return U16_TRAIL(iter->reservedField);
	521	} else if(iter->start<iter->limit) {
	522	const uint8_t s=(const uint8_t )iter->context;
	523	UChar32 c;
	524	int32_t i=iter->start;
	525
	526	L8_NEXT(s, i, iter->limit, c);
	527	if(c<0) {
	528	return 0xfffd;
	529	} else if(c<=0xffff) {
	530	return c;
	531	} else {
	532	return U16_LEAD(c);
	533	}
	534	} else {
	535	return U_SENTINEL;
	536	}
	537	}
	538
	539	static UChar32 U_CALLCONV
	540	lenient8IteratorNext(UCharIterator *iter) {
	541	int32_t index;
	542
	543	if(iter->reservedField!=0) {
	544	UChar trail=U16_TRAIL(iter->reservedField);
	545	iter->reservedField=0;
	546	if((index=iter->index)>=0) {
	547	iter->index=index+1;
	548	}
	549	return trail;
	550	} else if(iter->start<iter->limit) {
	551	const uint8_t s=(const uint8_t )iter->context;
	552	UChar32 c;
	553
	554	L8_NEXT(s, iter->start, iter->limit, c);
	555	if((index=iter->index)>=0) {
	556	iter->index=++index;
	557	if(iter->length<0 && iter->start==iter->limit) {
	558	iter->length= c<=0xffff ? index : index+1;
	559	}
	560	} else if(iter->start==iter->limit && iter->length>=0) {
	561	iter->index= c<=0xffff ? iter->length : iter->length-1;
	562	}
	563	if(c<0) {
	564	return 0xfffd;
	565	} else if(c<=0xffff) {
	566	return c;
	567	} else {
	568	iter->reservedField=c;
	569	return U16_LEAD(c);
	570	}
	571	} else {
	572	return U_SENTINEL;
	573	}
	574	}
	575
	576	static UChar32 U_CALLCONV
	577	lenient8IteratorPrevious(UCharIterator *iter) {
	578	int32_t index;
	579
	580	if(iter->reservedField!=0) {
	581	UChar lead=U16_LEAD(iter->reservedField);
	582	iter->reservedField=0;
	583	iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
	584	if((index=iter->index)>0) {
	585	iter->index=index-1;
	586	}
	587	return lead;
	588	} else if(iter->start>0) {
	589	const uint8_t s=(const uint8_t )iter->context;
	590	UChar32 c;
	591
	592	L8_PREV(s, 0, iter->start, c);
	593	if((index=iter->index)>0) {
	594	iter->index=index-1;
	595	} else if(iter->start<=1) {
	596	iter->index= c<=0xffff ? iter->start : iter->start+1;
	597	}
	598	if(c<0) {
	599	return 0xfffd;
	600	} else if(c<=0xffff) {
	601	return c;
	602	} else {
	603	iter->start+=4; /* back to behind this supplementary code point for consistent state */
	604	iter->reservedField=c;
	605	return U16_TRAIL(c);
	606	}
	607	} else {
	608	return U_SENTINEL;
	609	}
	610	}
	611
	612	static uint32_t U_CALLCONV
	613	lenient8IteratorGetState(const UCharIterator *iter) {
	614	uint32_t state=(uint32_t)(iter->start<<1);
	615	if(iter->reservedField!=0) {
	616	state\|=1;
	617	}
	618	return state;
	619	}
	620
	621	static void U_CALLCONV
	622	lenient8IteratorSetState(UCharIterator iter, uint32_t state, UErrorCode pErrorCode) {
	623	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	624	/* do nothing */
	625	} else if(iter==NULL) {
	626	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	627	} else if(state==lenient8IteratorGetState(iter)) {
	628	/* setting to the current state: no-op */
	629	} else {
	630	int32_t index=(int32_t)(state>>1); /* UTF-8 index */
	631	state&=1; /* 1 if in surrogate pair, must be index>=4 */
	632
	633	if((state==0 ? index<0 : index<4) \|\| iter->limit<index) {
	634	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	635	} else {
	636	iter->start=index; /* restore UTF-8 byte index */
	637	if(index<=1) {
	638	iter->index=index;
	639	} else {
	640	iter->index=-1; /* unknown UTF-16 index */
	641	}
	642	if(state==0) {
	643	iter->reservedField=0;
	644	} else {
	645	/* verified index>=4 above */
	646	UChar32 c;
	647	L8_PREV((const uint8_t *)iter->context, 0, index, c);
	648	if(c<=0xffff) {
	649	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	650	} else {
	651	iter->reservedField=c;
	652	}
	653	}
	654	}
	655	}
	656	}
	657
	658	static const UCharIterator lenient8Iterator={
	659	0, 0, 0, 0, 0, 0,
	660	lenient8IteratorGetIndex,
	661	lenient8IteratorMove,
	662	lenient8IteratorHasNext,
	663	lenient8IteratorHasPrevious,
	664	lenient8IteratorCurrent,
	665	lenient8IteratorNext,
	666	lenient8IteratorPrevious,
	667	NULL,
	668	lenient8IteratorGetState,
	669	lenient8IteratorSetState
	670	};
	671
	672	U_CAPI void U_EXPORT2
	673	uiter_setLenient8(UCharIterator iter, const char s, int32_t length) {
	674	if(iter!=0) {
	675	if(s!=0 && length>=-1) {
	676	*iter=lenient8Iterator;
	677	iter->context=s;
	678	if(length>=0) {
	679	iter->limit=length;
	680	} else {
	681	iter->limit=strlen(s);
	682	}
	683	iter->length= iter->limit<=1 ? iter->limit : -1;
	684	} else {
	685	/* set no-op iterator */
	686	uiter_setString(iter, NULL, 0);
	687	}
	688	}
	689	}