]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/samples/uciter8/uit_len8.c
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / samples / uciter8 / uit_len8.c
... / ...
CommitLineData
1/*
2*******************************************************************************
3*
4* Copyright (C) 2003, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: uit_len8.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2003feb10
14* created by: Markus W. Scherer
15*
16* This file contains the implementation of the "lenient UTF-8" UCharIterator
17* as used in the uciter8 sample code.
18* UTF-8-style macros are defined as well as the UCharIterator.
19* The macros are incomplete (do not assemble code points from pairs of
20* surrogates, see comment below)
21* but sufficient for the iterator.
22*/
23
24#include <string.h>
25#include "unicode/utypes.h"
26#include "unicode/uiter.h"
27
28/* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
29
30/*
31 * This code leniently reads 8-bit Unicode strings,
32 * which could contain a mix of UTF-8 and CESU-8.
33 * More precisely:
34 * - supplementary code points may be encoded with dedicated 4-byte sequences
35 * (UTF-8 style)
36 * - supplementary code points may be encoded with
37 * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
38 * (CESU-8 style)
39 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
40 *
41 * Limitation:
42 * Right now, the macros do not attempt to assemble code points from pairs of
43 * separately encoded surrogates.
44 * This would not be sufficient for processing based on these macros,
45 * but it is sufficient for a UCharIterator that returns only UChars anyway.
46 *
47 * The code is copied and modified from utf_impl.c and utf8.h.
48 * The "strict" argument in the implementation functions is completely removed,
49 * using the "<0" branch from the original code.
50 * Checks for surrogate code points are removed for the leniency
51 * described above.
52 */
53
54static const UChar32
55lenient8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
56
57static UChar32
58lenient8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c) {
59 int32_t i=*pi;
60 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
61 if((i)+count<=(length)) {
62 uint8_t trail, illegal=0;
63
64 U8_MASK_LEAD_BYTE((c), count);
65 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
66 switch(count) {
67 /* each branch falls through to the next one */
68 case 5:
69 case 4:
70 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
71 illegal=1;
72 break;
73 case 3:
74 trail=s[(i)++];
75 (c)=((c)<<6)|(trail&0x3f);
76 if(c<0x110) {
77 illegal|=(trail&0xc0)^0x80;
78 } else {
79 /* code point>0x10ffff, outside Unicode */
80 illegal=1;
81 break;
82 }
83 case 2:
84 trail=s[(i)++];
85 (c)=((c)<<6)|(trail&0x3f);
86 illegal|=(trail&0xc0)^0x80;
87 case 1:
88 trail=s[(i)++];
89 (c)=((c)<<6)|(trail&0x3f);
90 illegal|=(trail&0xc0)^0x80;
91 break;
92 case 0:
93 return U_SENTINEL;
94 /* no default branch to optimize switch() - all values are covered */
95 }
96
97 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
98 /* illegal is also set if count>=4 */
99 if(illegal || (c)<lenient8_minLegal[count]) {
100 /* error handling */
101 uint8_t errorCount=count;
102 /* don't go beyond this sequence */
103 i=*pi;
104 while(count>0 && U8_IS_TRAIL(s[i])) {
105 ++(i);
106 --count;
107 }
108 c=U_SENTINEL;
109 }
110 } else /* too few bytes left */ {
111 /* error handling */
112 int32_t i0=i;
113 /* don't just set (i)=(length) in case there is an illegal sequence */
114 while((i)<(length) && U8_IS_TRAIL(s[i])) {
115 ++(i);
116 }
117 c=U_SENTINEL;
118 }
119 *pi=i;
120 return c;
121}
122
123static UChar32
124lenient8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c) {
125 int32_t i=*pi;
126 uint8_t b, count=1, shift=6;
127
128 /* extract value bits from the last trail byte */
129 c&=0x3f;
130
131 for(;;) {
132 if(i<=start) {
133 /* no lead byte at all */
134 return U_SENTINEL;
135 }
136
137 /* read another previous byte */
138 b=s[--i];
139 if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
140 if(b&0x40) {
141 /* lead byte, this will always end the loop */
142 uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
143
144 if(count==shouldCount) {
145 /* set the new position */
146 *pi=i;
147 U8_MASK_LEAD_BYTE(b, count);
148 c|=(UChar32)b<<shift;
149 if(count>=4 || c>0x10ffff || c<lenient8_minLegal[count]) {
150 /* illegal sequence */
151 if(count>=4) {
152 count=3;
153 }
154 c=U_SENTINEL;
155 } else {
156 /* exit with correct c */
157 }
158 } else {
159 /* the lead byte does not match the number of trail bytes */
160 /* only set the position to the lead byte if it would
161 include the trail byte that we started with */
162 if(count<shouldCount) {
163 *pi=i;
164 }
165 c=U_SENTINEL;
166 }
167 break;
168 } else if(count<5) {
169 /* trail byte */
170 c|=(UChar32)(b&0x3f)<<shift;
171 ++count;
172 shift+=6;
173 } else {
174 /* more than 5 trail bytes is illegal */
175 c=U_SENTINEL;
176 break;
177 }
178 } else {
179 /* single-byte character precedes trailing bytes */
180 c=U_SENTINEL;
181 break;
182 }
183 }
184 return c;
185}
186
/*
 * L8_NEXT(s, i, length, c)
 * Read one code point from s at index i, moving i forward past it.
 * c receives the byte itself if it is below 0x80, the decoded code point for
 * a multi-byte sequence, or U_SENTINEL for an illegal sequence.
 * i and c must be modifiable lvalues; they are evaluated more than once.
 * s is used as unsigned bytes (const uint8_t * at every call in this file).
 *
 * Wrapped in do { } while(0) so that the macro behaves as a single statement,
 * e.g. in an unbraced if/else; invoke it with a trailing semicolon.
 */
#define L8_NEXT(s, i, length, c) do { \
    (c)=(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=lenient8_nextCharSafeBody(s, &(i), (int32_t)(length), c); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
197
/*
 * L8_PREV(s, start, i, c)
 * Read the code point that ends just before index i, moving i backward.
 * c receives the byte itself if it is below 0x80, the decoded code point if
 * the byte is a trail byte (0x80..0xbf) of a well-formed sequence, or
 * U_SENTINEL otherwise (including when the byte before i is a lead byte).
 * i and c must be modifiable lvalues; they are evaluated more than once.
 * s is used as unsigned bytes (const uint8_t * at every call in this file).
 *
 * Wrapped in do { } while(0) so that the macro behaves as a single statement,
 * e.g. in an unbraced if/else; invoke it with a trailing semicolon.
 */
#define L8_PREV(s, start, i, c) do { \
    (c)=(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=lenient8_prevCharSafeBody(s, start, &(i), c); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
208
209/* lenient-8 UCharIterator -------------------------------------------------- */
210
211/*
212 * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
213 * except that it uses the lenient-8-bit-Unicode macros above.
214 */
215
216/*
217 * Minimal implementation:
218 * Maintain a single-UChar buffer for an additional surrogate.
219 * The caller must not modify start and limit because they are used internally.
220 *
221 * Use UCharIterator fields as follows:
222 * context pointer to UTF-8 string
223 * length UTF-16 length of the string; -1 until lazy evaluation
224 * start current UTF-8 index
225 * index current UTF-16 index; may be -1="unknown" after setState()
226 * limit UTF-8 length of the string
227 * reservedField supplementary code point
228 *
229 * Since UCharIterator delivers 16-bit code units, the iteration can be
230 * currently in the middle of the byte sequence for a supplementary code point.
231 * In this case, reservedField will contain that code point and start will
232 * point to after the corresponding byte sequence. The UTF-16 index will be
233 * one less than what it would otherwise be corresponding to the UTF-8 index.
234 * Otherwise, reservedField will be 0.
235 */
236
237/*
238 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
239 * Add implementations that do not call strlen() for iteration but check for NUL.
240 */
241
242static int32_t U_CALLCONV
243lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
244 switch(origin) {
245 case UITER_ZERO:
246 case UITER_START:
247 return 0;
248 case UITER_CURRENT:
249 if(iter->index<0) {
250 /* the current UTF-16 index is unknown after setState(), count from the beginning */
251 const uint8_t *s;
252 UChar32 c;
253 int32_t i, limit, index;
254
255 s=(const uint8_t *)iter->context;
256 i=index=0;
257 limit=iter->start; /* count up to the UTF-8 index */
258 while(i<limit) {
259 L8_NEXT(s, i, limit, c);
260 if(c<=0xffff) {
261 ++index;
262 } else {
263 index+=2;
264 }
265 }
266
267 iter->start=i; /* just in case setState() did not get us to a code point boundary */
268 if(i==iter->limit) {
269 iter->length=index; /* in case it was <0 or wrong */
270 }
271 if(iter->reservedField!=0) {
272 --index; /* we are in the middle of a supplementary code point */
273 }
274 iter->index=index;
275 }
276 return iter->index;
277 case UITER_LIMIT:
278 case UITER_LENGTH:
279 if(iter->length<0) {
280 const uint8_t *s;
281 UChar32 c;
282 int32_t i, limit, length;
283
284 s=(const uint8_t *)iter->context;
285 if(iter->index<0) {
286 /*
287 * the current UTF-16 index is unknown after setState(),
288 * we must first count from the beginning to here
289 */
290 i=length=0;
291 limit=iter->start;
292
293 /* count from the beginning to the current index */
294 while(i<limit) {
295 L8_NEXT(s, i, limit, c);
296 if(c<=0xffff) {
297 ++length;
298 } else {
299 length+=2;
300 }
301 }
302
303 /* assume i==limit==iter->start, set the UTF-16 index */
304 iter->start=i; /* just in case setState() did not get us to a code point boundary */
305 iter->index= iter->reservedField!=0 ? length-1 : length;
306 } else {
307 i=iter->start;
308 length=iter->index;
309 if(iter->reservedField!=0) {
310 ++length;
311 }
312 }
313
314 /* count from the current index to the end */
315 limit=iter->limit;
316 while(i<limit) {
317 L8_NEXT(s, i, limit, c);
318 if(c<=0xffff) {
319 ++length;
320 } else {
321 length+=2;
322 }
323 }
324 iter->length=length;
325 }
326 return iter->length;
327 default:
328 /* not a valid origin */
329 /* Should never get here! */
330 return -1;
331 }
332}
333
334static int32_t U_CALLCONV
335lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
336 const uint8_t *s;
337 UChar32 c;
338 int32_t pos; /* requested UTF-16 index */
339 int32_t i; /* UTF-8 index */
340 UBool havePos;
341
342 /* calculate the requested UTF-16 index */
343 switch(origin) {
344 case UITER_ZERO:
345 case UITER_START:
346 pos=delta;
347 havePos=TRUE;
348 /* iter->index<0 (unknown) is possible */
349 break;
350 case UITER_CURRENT:
351 if(iter->index>=0) {
352 pos=iter->index+delta;
353 havePos=TRUE;
354 } else {
355 /* the current UTF-16 index is unknown after setState(), use only delta */
356 pos=0;
357 havePos=FALSE;
358 }
359 break;
360 case UITER_LIMIT:
361 case UITER_LENGTH:
362 if(iter->length>=0) {
363 pos=iter->length+delta;
364 havePos=TRUE;
365 } else {
366 /* pin to the end, avoid counting the length */
367 iter->index=-1;
368 iter->start=iter->limit;
369 iter->reservedField=0;
370 if(delta>=0) {
371 return UITER_UNKNOWN_INDEX;
372 } else {
373 /* the current UTF-16 index is unknown, use only delta */
374 pos=0;
375 havePos=FALSE;
376 }
377 }
378 break;
379 default:
380 return -1; /* Error */
381 }
382
383 if(havePos) {
384 /* shortcuts: pinning to the edges of the string */
385 if(pos<=0) {
386 iter->index=iter->start=iter->reservedField=0;
387 return 0;
388 } else if(iter->length>=0 && pos>=iter->length) {
389 iter->index=iter->length;
390 iter->start=iter->limit;
391 iter->reservedField=0;
392 return iter->index;
393 }
394
395 /* minimize the number of L8_NEXT/PREV operations */
396 if(iter->index<0 || pos<iter->index/2) {
397 /* go forward from the start instead of backward from the current index */
398 iter->index=iter->start=iter->reservedField=0;
399 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
400 /*
401 * if we have the UTF-16 index and length and the new position is
402 * closer to the end than the current index,
403 * then go backward from the end instead of forward from the current index
404 */
405 iter->index=iter->length;
406 iter->start=iter->limit;
407 iter->reservedField=0;
408 }
409
410 delta=pos-iter->index;
411 if(delta==0) {
412 return iter->index; /* nothing to do */
413 }
414 } else {
415 /* move relative to unknown UTF-16 index */
416 if(delta==0) {
417 return UITER_UNKNOWN_INDEX; /* nothing to do */
418 } else if(-delta>=iter->start) {
419 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
420 iter->index=iter->start=iter->reservedField=0;
421 return 0;
422 } else if(delta>=(iter->limit-iter->start)) {
423 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
424 iter->index=iter->length; /* may or may not be <0 (unknown) */
425 iter->start=iter->limit;
426 iter->reservedField=0;
427 return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
428 }
429 }
430
431 /* delta!=0 */
432
433 /* move towards the requested position, pin to the edges of the string */
434 s=(const uint8_t *)iter->context;
435 pos=iter->index; /* could be <0 (unknown) */
436 i=iter->start;
437 if(delta>0) {
438 /* go forward */
439 int32_t limit=iter->limit;
440 if(iter->reservedField!=0) {
441 iter->reservedField=0;
442 ++pos;
443 --delta;
444 }
445 while(delta>0 && i<limit) {
446 L8_NEXT(s, i, limit, c);
447 if(c<0xffff) {
448 ++pos;
449 --delta;
450 } else if(delta>=2) {
451 pos+=2;
452 delta-=2;
453 } else /* delta==1 */ {
454 /* stop in the middle of a supplementary code point */
455 iter->reservedField=c;
456 ++pos;
457 break; /* delta=0; */
458 }
459 }
460 if(i==limit) {
461 if(iter->length<0 && iter->index>=0) {
462 iter->length= iter->reservedField==0 ? pos : pos+1;
463 } else if(iter->index<0 && iter->length>=0) {
464 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
465 }
466 }
467 } else /* delta<0 */ {
468 /* go backward */
469 if(iter->reservedField!=0) {
470 iter->reservedField=0;
471 i-=4; /* we stayed behind the supplementary code point; go before it now */
472 --pos;
473 ++delta;
474 }
475 while(delta<0 && i>0) {
476 L8_PREV(s, 0, i, c);
477 if(c<0xffff) {
478 --pos;
479 ++delta;
480 } else if(delta<=-2) {
481 pos-=2;
482 delta+=2;
483 } else /* delta==-1 */ {
484 /* stop in the middle of a supplementary code point */
485 i+=4; /* back to behind this supplementary code point for consistent state */
486 iter->reservedField=c;
487 --pos;
488 break; /* delta=0; */
489 }
490 }
491 }
492
493 iter->start=i;
494 if(iter->index>=0) {
495 return iter->index=pos;
496 } else {
497 /* we started with index<0 (unknown) so pos is bogus */
498 if(i<=1) {
499 return iter->index=i; /* reached the beginning */
500 } else {
501 /* we still don't know the UTF-16 index */
502 return UITER_UNKNOWN_INDEX;
503 }
504 }
505}
506
507static UBool U_CALLCONV
508lenient8IteratorHasNext(UCharIterator *iter) {
509 return iter->reservedField!=0 || iter->start<iter->limit;
510}
511
512static UBool U_CALLCONV
513lenient8IteratorHasPrevious(UCharIterator *iter) {
514 return iter->start>0;
515}
516
517static UChar32 U_CALLCONV
518lenient8IteratorCurrent(UCharIterator *iter) {
519 if(iter->reservedField!=0) {
520 return U16_TRAIL(iter->reservedField);
521 } else if(iter->start<iter->limit) {
522 const uint8_t *s=(const uint8_t *)iter->context;
523 UChar32 c;
524 int32_t i=iter->start;
525
526 L8_NEXT(s, i, iter->limit, c);
527 if(c<0) {
528 return 0xfffd;
529 } else if(c<=0xffff) {
530 return c;
531 } else {
532 return U16_LEAD(c);
533 }
534 } else {
535 return U_SENTINEL;
536 }
537}
538
539static UChar32 U_CALLCONV
540lenient8IteratorNext(UCharIterator *iter) {
541 int32_t index;
542
543 if(iter->reservedField!=0) {
544 UChar trail=U16_TRAIL(iter->reservedField);
545 iter->reservedField=0;
546 if((index=iter->index)>=0) {
547 iter->index=index+1;
548 }
549 return trail;
550 } else if(iter->start<iter->limit) {
551 const uint8_t *s=(const uint8_t *)iter->context;
552 UChar32 c;
553
554 L8_NEXT(s, iter->start, iter->limit, c);
555 if((index=iter->index)>=0) {
556 iter->index=++index;
557 if(iter->length<0 && iter->start==iter->limit) {
558 iter->length= c<=0xffff ? index : index+1;
559 }
560 } else if(iter->start==iter->limit && iter->length>=0) {
561 iter->index= c<=0xffff ? iter->length : iter->length-1;
562 }
563 if(c<0) {
564 return 0xfffd;
565 } else if(c<=0xffff) {
566 return c;
567 } else {
568 iter->reservedField=c;
569 return U16_LEAD(c);
570 }
571 } else {
572 return U_SENTINEL;
573 }
574}
575
576static UChar32 U_CALLCONV
577lenient8IteratorPrevious(UCharIterator *iter) {
578 int32_t index;
579
580 if(iter->reservedField!=0) {
581 UChar lead=U16_LEAD(iter->reservedField);
582 iter->reservedField=0;
583 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
584 if((index=iter->index)>0) {
585 iter->index=index-1;
586 }
587 return lead;
588 } else if(iter->start>0) {
589 const uint8_t *s=(const uint8_t *)iter->context;
590 UChar32 c;
591
592 L8_PREV(s, 0, iter->start, c);
593 if((index=iter->index)>0) {
594 iter->index=index-1;
595 } else if(iter->start<=1) {
596 iter->index= c<=0xffff ? iter->start : iter->start+1;
597 }
598 if(c<0) {
599 return 0xfffd;
600 } else if(c<=0xffff) {
601 return c;
602 } else {
603 iter->start+=4; /* back to behind this supplementary code point for consistent state */
604 iter->reservedField=c;
605 return U16_TRAIL(c);
606 }
607 } else {
608 return U_SENTINEL;
609 }
610}
611
612static uint32_t U_CALLCONV
613lenient8IteratorGetState(const UCharIterator *iter) {
614 uint32_t state=(uint32_t)(iter->start<<1);
615 if(iter->reservedField!=0) {
616 state|=1;
617 }
618 return state;
619}
620
621static void U_CALLCONV
622lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
623 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
624 /* do nothing */
625 } else if(iter==NULL) {
626 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
627 } else if(state==lenient8IteratorGetState(iter)) {
628 /* setting to the current state: no-op */
629 } else {
630 int32_t index=(int32_t)(state>>1); /* UTF-8 index */
631 state&=1; /* 1 if in surrogate pair, must be index>=4 */
632
633 if((state==0 ? index<0 : index<4) || iter->limit<index) {
634 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
635 } else {
636 iter->start=index; /* restore UTF-8 byte index */
637 if(index<=1) {
638 iter->index=index;
639 } else {
640 iter->index=-1; /* unknown UTF-16 index */
641 }
642 if(state==0) {
643 iter->reservedField=0;
644 } else {
645 /* verified index>=4 above */
646 UChar32 c;
647 L8_PREV((const uint8_t *)iter->context, 0, index, c);
648 if(c<=0xffff) {
649 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
650 } else {
651 iter->reservedField=c;
652 }
653 }
654 }
655 }
656}
657
658static const UCharIterator lenient8Iterator={
659 0, 0, 0, 0, 0, 0,
660 lenient8IteratorGetIndex,
661 lenient8IteratorMove,
662 lenient8IteratorHasNext,
663 lenient8IteratorHasPrevious,
664 lenient8IteratorCurrent,
665 lenient8IteratorNext,
666 lenient8IteratorPrevious,
667 NULL,
668 lenient8IteratorGetState,
669 lenient8IteratorSetState
670};
671
672U_CAPI void U_EXPORT2
673uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
674 if(iter!=0) {
675 if(s!=0 && length>=-1) {
676 *iter=lenient8Iterator;
677 iter->context=s;
678 if(length>=0) {
679 iter->limit=length;
680 } else {
681 iter->limit=strlen(s);
682 }
683 iter->length= iter->limit<=1 ? iter->limit : -1;
684 } else {
685 /* set no-op iterator */
686 uiter_setString(iter, NULL, 0);
687 }
688 }
689}