]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/utf_impl.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / utf_impl.c
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 1999-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: utf_impl.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999sep13
14 * created by: Markus W. Scherer
15 *
16 * This file provides implementation functions for macros in the utfXX.h
17 * that would otherwise be too long as macros.
18 */
19
20 /* set import/export definitions */
21 #ifndef U_UTF8_IMPL
22 # define U_UTF8_IMPL
23 #endif
24
25 #include "unicode/utypes.h"
26
27 /*
28 * This table could be replaced on many machines by
29 * a few lines of assembler code using an
30 * "index of first 0-bit from msb" instruction and
31 * one or two more integer instructions.
32 *
33 * For example, on an i386, do something like
34 * - MOV AL, leadByte
35 * - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0)
36 * - MOV AH, 0
37 * - BSR BX, AX (16-bit)
38 * - MOV AX, 6 (result)
39 * - JZ finish (ZF==1 if leadByte==0xff)
40 * - SUB AX, BX (result)
41 * -finish:
42 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
43 *
44 * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
45 * lead bytes above 0xf4 are illegal.
46 * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
47 */
48 U_EXPORT const uint8_t
49 utf8_countTrailBytes[256]={
50 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
51 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
52 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54
55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59
60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64
65 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
66 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
67
68 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
69 3, 3, 3, 3, 3,
70 3, 3, 3, /* illegal in Unicode */
71 4, 4, 4, 4, /* illegal in Unicode */
72 5, 5, /* illegal in Unicode */
73 0, 0 /* illegal bytes 0xfe and 0xff */
74 };
75
76 static const UChar32
77 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
78
79 static const UChar32
80 utf8_errorValue[6]={
81 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
82 0x3ffffff, 0x7fffffff
83 };
84
85 U_CAPI UChar32 U_EXPORT2
86 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
87 int32_t i=*pi;
88 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
89 if((i)+count<=(length)) {
90 uint8_t trail, illegal=0;
91
92 UTF8_MASK_LEAD_BYTE((c), count);
93 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
94 switch(count) {
95 /* each branch falls through to the next one */
96 case 5:
97 case 4:
98 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
99 illegal=1;
100 break;
101 case 3:
102 trail=s[(i)++];
103 (c)=((c)<<6)|(trail&0x3f);
104 if(c<0x110) {
105 illegal|=(trail&0xc0)^0x80;
106 } else {
107 /* code point>0x10ffff, outside Unicode */
108 illegal=1;
109 break;
110 }
111 case 2:
112 trail=s[(i)++];
113 (c)=((c)<<6)|(trail&0x3f);
114 illegal|=(trail&0xc0)^0x80;
115 case 1:
116 trail=s[(i)++];
117 (c)=((c)<<6)|(trail&0x3f);
118 illegal|=(trail&0xc0)^0x80;
119 break;
120 case 0:
121 if(strict>=0) {
122 return UTF8_ERROR_VALUE_1;
123 } else {
124 return U_SENTINEL;
125 }
126 /* no default branch to optimize switch() - all values are covered */
127 }
128
129 /*
130 * All the error handling should return a value
131 * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
132 *
133 * Starting with Unicode 3.0.1, non-shortest forms are illegal.
134 * Starting with Unicode 3.2, surrogate code points must not be
135 * encoded in UTF-8, and there are no irregular sequences any more.
136 *
137 * U8_ macros (new in ICU 2.4) return negative values for error conditions.
138 */
139
140 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
141 /* illegal is also set if count>=4 */
142 if(illegal || (c)<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
143 /* error handling */
144 uint8_t errorCount=count;
145 /* don't go beyond this sequence */
146 i=*pi;
147 while(count>0 && UTF8_IS_TRAIL(s[i])) {
148 ++(i);
149 --count;
150 }
151 if(strict>=0) {
152 c=utf8_errorValue[errorCount-count];
153 } else {
154 c=U_SENTINEL;
155 }
156 } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) {
157 /* strict: forbid non-characters like U+fffe */
158 c=utf8_errorValue[count];
159 }
160 } else /* too few bytes left */ {
161 /* error handling */
162 int32_t i0=i;
163 /* don't just set (i)=(length) in case there is an illegal sequence */
164 while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
165 ++(i);
166 }
167 if(strict>=0) {
168 c=utf8_errorValue[i-i0];
169 } else {
170 c=U_SENTINEL;
171 }
172 }
173 *pi=i;
174 return c;
175 }
176
177 U_CAPI int32_t U_EXPORT2
178 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
179 if((uint32_t)(c)<=0x7ff) {
180 if((i)+1<(length)) {
181 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
182 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
183 return i;
184 }
185 } else if((uint32_t)(c)<=0xffff) {
186 /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
187 if((i)+2<(length) && !U_IS_SURROGATE(c)) {
188 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
189 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
190 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
191 return i;
192 }
193 } else if((uint32_t)(c)<=0x10ffff) {
194 if((i)+3<(length)) {
195 (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
196 (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
197 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
198 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
199 return i;
200 }
201 }
202 /* c>0x10ffff or not enough space, write an error value */
203 if(pIsError!=NULL) {
204 *pIsError=TRUE;
205 } else {
206 length-=i;
207 if(length>0) {
208 int32_t offset;
209 if(length>3) {
210 length=3;
211 }
212 s+=i;
213 offset=0;
214 c=utf8_errorValue[length-1];
215 UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
216 i=i+offset;
217 }
218 }
219 return i;
220 }
221
222 U_CAPI UChar32 U_EXPORT2
223 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
224 int32_t i=*pi;
225 uint8_t b, count=1, shift=6;
226
227 /* extract value bits from the last trail byte */
228 c&=0x3f;
229
230 for(;;) {
231 if(i<=start) {
232 /* no lead byte at all */
233 if(strict>=0) {
234 return UTF8_ERROR_VALUE_1;
235 } else {
236 return U_SENTINEL;
237 }
238 break;
239 }
240
241 /* read another previous byte */
242 b=s[--i];
243 if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
244 if(b&0x40) {
245 /* lead byte, this will always end the loop */
246 uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
247
248 if(count==shouldCount) {
249 /* set the new position */
250 *pi=i;
251 UTF8_MASK_LEAD_BYTE(b, count);
252 c|=(UChar32)b<<shift;
253 if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c) || (strict>0 && UTF_IS_UNICODE_NONCHAR(c))) {
254 /* illegal sequence or (strict and non-character) */
255 if(count>=4) {
256 count=3;
257 }
258 if(strict>=0) {
259 c=utf8_errorValue[count];
260 } else {
261 c=U_SENTINEL;
262 }
263 } else {
264 /* exit with correct c */
265 }
266 } else {
267 /* the lead byte does not match the number of trail bytes */
268 /* only set the position to the lead byte if it would
269 include the trail byte that we started with */
270 if(count<shouldCount) {
271 *pi=i;
272 if(strict>=0) {
273 c=utf8_errorValue[count];
274 } else {
275 c=U_SENTINEL;
276 }
277 } else {
278 if(strict>=0) {
279 c=UTF8_ERROR_VALUE_1;
280 } else {
281 c=U_SENTINEL;
282 }
283 }
284 }
285 break;
286 } else if(count<5) {
287 /* trail byte */
288 c|=(UChar32)(b&0x3f)<<shift;
289 ++count;
290 shift+=6;
291 } else {
292 /* more than 5 trail bytes is illegal */
293 if(strict>=0) {
294 c=UTF8_ERROR_VALUE_1;
295 } else {
296 c=U_SENTINEL;
297 }
298 break;
299 }
300 } else {
301 /* single-byte character precedes trailing bytes */
302 if(strict>=0) {
303 c=UTF8_ERROR_VALUE_1;
304 } else {
305 c=U_SENTINEL;
306 }
307 break;
308 }
309 }
310 return c;
311 }
312
313 U_CAPI int32_t U_EXPORT2
314 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
315 /* i had been decremented once before the function call */
316 int32_t I=i, Z;
317 uint8_t b;
318
319 /* read at most the 6 bytes s[Z] to s[i], inclusively */
320 if(I-5>start) {
321 Z=I-5;
322 } else {
323 Z=start;
324 }
325
326 /* return I if the sequence starting there is long enough to include i */
327 do {
328 b=s[I];
329 if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
330 break;
331 } else if(b>=0xc0) {
332 if(UTF8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
333 return I;
334 } else {
335 break;
336 }
337 }
338 } while(Z<=--I);
339
340 /* return i itself to be consistent with the FWD_1 macro */
341 return i;
342 }