git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	******************************************************************************
	5	*
	6	* Copyright (C) 1999-2012, International Business Machines
	7	* Corporation and others. All Rights Reserved.
	8	*
	9	******************************************************************************
	10	* file name: utf_impl.cpp
	11	* encoding: UTF-8
	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 1999sep13
	16	* created by: Markus W. Scherer
	17	*
	18	* This file provides implementation functions for macros in the utfXX.h
	19	* that would otherwise be too long as macros.
	20	*/
	21
	22	/* set import/export definitions */
	23	#ifndef U_UTF8_IMPL
	24	# define U_UTF8_IMPL
	25	#endif
	26
	27	#include "unicode/utypes.h"
	28	#include "unicode/utf.h"
	29	#include "unicode/utf8.h"
	30	#include "uassert.h"
	31
	32	/*
	33	* Table of the number of utf8 trail bytes, indexed by the lead byte.
	34	* Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h
	35	*
	36	* The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
	37	*
	38	* Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
	39	* changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
	40	* may exist in old client code that must continue to run with newer icu library versions.
	41	*
	42	* This table could be replaced on many machines by
	43	* a few lines of assembler code using an
	44	* "index of first 0-bit from msb" instruction and
	45	* one or two more integer instructions.
	46	*
	47	* For example, on an i386, do something like
	48	* - MOV AL, leadByte
	49	* - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0)
	50	* - MOV AH, 0
	51	* - BSR BX, AX (16-bit)
	52	* - MOV AX, 6 (result)
	53	* - JZ finish (ZF==1 if leadByte==0xff)
	54	* - SUB AX, BX (result)
	55	* -finish:
	56	* (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
	57	*/
	58	extern "C" U_EXPORT const uint8_t
	59	utf8_countTrailBytes[256]={
	60	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	61	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	62	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	63	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	64
	65	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	66	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	67	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	68	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	69
	70	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	71	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	72	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	73	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	74
	75	// illegal C0 & C1
	76	// 2-byte lead bytes C2..DF
	77	0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	78	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	79
	80	// 3-byte lead bytes E0..EF
	81	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	82	// 4-byte lead bytes F0..F4
	83	// illegal F5..FF
	84	3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	85	};
	86
	87	static const UChar32
	88	utf8_errorValue[6]={
	89	// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
	90	// but without relying on the obsolete unicode/utf_old.h.
	91	0x15, 0x9f, 0xffff,
	92	0x10ffff
	93	};
	94
	95	static UChar32
	96	errorValue(int32_t count, int8_t strict) {
	97	if(strict>=0) {
	98	return utf8_errorValue[count];
	99	} else if(strict==-3) {
	100	return 0xfffd;
	101	} else {
	102	return U_SENTINEL;
	103	}
	104	}
	105
	106	/*
	107	* Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
	108	* and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
	109	*
	110	* U8_NEXT() supports NUL-terminated strings indicated via length<0.
	111	*
	112	* The "strict" parameter controls the error behavior:
	113	* <0 "Safe" behavior of U8_NEXT():
	114	* -1: All illegal byte sequences yield U_SENTINEL=-1.
	115	* -2: Same as -1, except for lenient treatment of surrogate code points as legal.
	116	* Some implementations use this for roundtripping of
	117	* Unicode 16-bit strings that are not well-formed UTF-16, that is, they
	118	* contain unpaired surrogates.
	119	* -3: All illegal byte sequences yield U+FFFD.
	120	* 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
	121	* All illegal byte sequences yield a positive code point such that this
	122	* result code point would be encoded with the same number of bytes as
	123	* the illegal sequence.
	124	* >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
	125	* Same as the obsolete "safe" behavior, but non-characters are also treated
	126	* like illegal sequences.
	127	*
	128	* Note that a UBool is the same as an int8_t.
	129	*/
	130	U_CAPI UChar32 U_EXPORT2
	131	utf8_nextCharSafeBody(const uint8_t s, int32_t pi, int32_t length, UChar32 c, UBool strict) {
	132	// *pi is one after byte c.
	133	int32_t i=*pi;
	134	// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
	135	if(i==length \|\| c>0xf4) {
	136	// end of string, or not a lead byte
	137	} else if(c>=0xf0) {
	138	// Test for 4-byte sequences first because
	139	// U8_NEXT() handles shorter valid sequences inline.
	140	uint8_t t1=s[i], t2, t3;
	141	c&=7;
	142	if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
	143	++i!=length && (t2=s[i]-0x80)<=0x3f &&
	144	++i!=length && (t3=s[i]-0x80)<=0x3f) {
	145	++i;
	146	c=(c<<18)\|((t1&0x3f)<<12)\|(t2<<6)\|t3;
	147	// strict: forbid non-characters like U+fffe
	148	if(strict<=0 \|\| !U_IS_UNICODE_NONCHAR(c)) {
	149	*pi=i;
	150	return c;
	151	}
	152	}
	153	} else if(c>=0xe0) {
	154	c&=0xf;
	155	if(strict!=-2) {
	156	uint8_t t1=s[i], t2;
	157	if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
	158	++i!=length && (t2=s[i]-0x80)<=0x3f) {
	159	++i;
	160	c=(c<<12)\|((t1&0x3f)<<6)\|t2;
	161	// strict: forbid non-characters like U+fffe
	162	if(strict<=0 \|\| !U_IS_UNICODE_NONCHAR(c)) {
	163	*pi=i;
	164	return c;
	165	}
	166	}
	167	} else {
	168	// strict=-2 -> lenient: allow surrogates
	169	uint8_t t1=s[i]-0x80, t2;
	170	if(t1<=0x3f && (c>0 \|\| t1>=0x20) &&
	171	++i!=length && (t2=s[i]-0x80)<=0x3f) {
	172	*pi=i+1;
	173	return (c<<12)\|(t1<<6)\|t2;
	174	}
	175	}
	176	} else if(c>=0xc2) {
	177	uint8_t t1=s[i]-0x80;
	178	if(t1<=0x3f) {
	179	*pi=i+1;
	180	return ((c-0xc0)<<6)\|t1;
	181	}
	182	} // else 0x80<=c<0xc2 is not a lead byte
	183
	184	/* error handling */
	185	c=errorValue(i-*pi, strict);
	186	*pi=i;
	187	return c;
	188	}
	189
	190	U_CAPI int32_t U_EXPORT2
	191	utf8_appendCharSafeBody(uint8_t s, int32_t i, int32_t length, UChar32 c, UBool pIsError) {
	192	if((uint32_t)(c)<=0x7ff) {
	193	if((i)+1<(length)) {
	194	(s)[(i)++]=(uint8_t)(((c)>>6)\|0xc0);
	195	(s)[(i)++]=(uint8_t)(((c)&0x3f)\|0x80);
	196	return i;
	197	}
	198	} else if((uint32_t)(c)<=0xffff) {
	199	/* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
	200	if((i)+2<(length) && !U_IS_SURROGATE(c)) {
	201	(s)[(i)++]=(uint8_t)(((c)>>12)\|0xe0);
	202	(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)\|0x80);
	203	(s)[(i)++]=(uint8_t)(((c)&0x3f)\|0x80);
	204	return i;
	205	}
	206	} else if((uint32_t)(c)<=0x10ffff) {
	207	if((i)+3<(length)) {
	208	(s)[(i)++]=(uint8_t)(((c)>>18)\|0xf0);
	209	(s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)\|0x80);
	210	(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)\|0x80);
	211	(s)[(i)++]=(uint8_t)(((c)&0x3f)\|0x80);
	212	return i;
	213	}
	214	}
	215	/* c>0x10ffff or not enough space, write an error value */
	216	if(pIsError!=NULL) {
	217	*pIsError=TRUE;
	218	} else {
	219	length-=i;
	220	if(length>0) {
	221	int32_t offset;
	222	if(length>3) {
	223	length=3;
	224	}
	225	s+=i;
	226	offset=0;
	227	c=utf8_errorValue[length-1];
	228	U8_APPEND_UNSAFE(s, offset, c);
	229	i=i+offset;
	230	}
	231	}
	232	return i;
	233	}
	234
	235	U_CAPI UChar32 U_EXPORT2
	236	utf8_prevCharSafeBody(const uint8_t s, int32_t start, int32_t pi, UChar32 c, UBool strict) {
	237	// *pi is the index of byte c.
	238	int32_t i=*pi;
	239	if(U8_IS_TRAIL(c) && i>start) {
	240	uint8_t b1=s[--i];
	241	if(U8_IS_LEAD(b1)) {
	242	if(b1<0xe0) {
	243	*pi=i;
	244	return ((b1-0xc0)<<6)\|(c&0x3f);
	245	} else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
	246	// Truncated 3- or 4-byte sequence.
	247	*pi=i;
	248	return errorValue(1, strict);
	249	}
	250	} else if(U8_IS_TRAIL(b1) && i>start) {
	251	// Extract the value bits from the last trail byte.
	252	c&=0x3f;
	253	uint8_t b2=s[--i];
	254	if(0xe0<=b2 && b2<=0xf4) {
	255	if(b2<0xf0) {
	256	b2&=0xf;
	257	if(strict!=-2) {
	258	if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
	259	*pi=i;
	260	c=(b2<<12)\|((b1&0x3f)<<6)\|c;
	261	if(strict<=0 \|\| !U_IS_UNICODE_NONCHAR(c)) {
	262	return c;
	263	} else {
	264	// strict: forbid non-characters like U+fffe
	265	return errorValue(2, strict);
	266	}
	267	}
	268	} else {
	269	// strict=-2 -> lenient: allow surrogates
	270	b1-=0x80;
	271	if((b2>0 \|\| b1>=0x20)) {
	272	*pi=i;
	273	return (b2<<12)\|(b1<<6)\|c;
	274	}
	275	}
	276	} else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
	277	// Truncated 4-byte sequence.
	278	*pi=i;
	279	return errorValue(2, strict);
	280	}
	281	} else if(U8_IS_TRAIL(b2) && i>start) {
	282	uint8_t b3=s[--i];
	283	if(0xf0<=b3 && b3<=0xf4) {
	284	b3&=7;
	285	if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
	286	*pi=i;
	287	c=(b3<<18)\|((b2&0x3f)<<12)\|((b1&0x3f)<<6)\|c;
	288	if(strict<=0 \|\| !U_IS_UNICODE_NONCHAR(c)) {
	289	return c;
	290	} else {
	291	// strict: forbid non-characters like U+fffe
	292	return errorValue(3, strict);
	293	}
	294	}
	295	}
	296	}
	297	}
	298	}
	299	return errorValue(0, strict);
	300	}
	301
	302	U_CAPI int32_t U_EXPORT2
	303	utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
	304	// Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
	305	int32_t orig_i=i;
	306	uint8_t c=s[i];
	307	if(U8_IS_TRAIL(c) && i>start) {
	308	uint8_t b1=s[--i];
	309	if(U8_IS_LEAD(b1)) {
	310	if(b1<0xe0 \|\|
	311	(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
	312	return i;
	313	}
	314	} else if(U8_IS_TRAIL(b1) && i>start) {
	315	uint8_t b2=s[--i];
	316	if(0xe0<=b2 && b2<=0xf4) {
	317	if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
	318	return i;
	319	}
	320	} else if(U8_IS_TRAIL(b2) && i>start) {
	321	uint8_t b3=s[--i];
	322	if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
	323	return i;
	324	}
	325	}
	326	}
	327	}
	328	return orig_i;
	329	}