]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2003, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: uit_len8.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2003feb10 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This file contains the implementation of the "lenient UTF-8" UCharIterator | |
17 | * as used in the uciter8 sample code. | |
18 | * UTF-8-style macros are defined as well as the UCharIterator. | |
19 | * The macros are incomplete (do not assemble code points from pairs of | |
20 | * surrogates, see comment below) | |
21 | * but sufficient for the iterator. | |
22 | */ | |
23 | ||
24 | #include <string.h> | |
25 | #include "unicode/utypes.h" | |
26 | #include "unicode/uiter.h" | |
27 | ||
28 | /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */ | |
29 | ||
30 | /* | |
31 | * This code leniently reads 8-bit Unicode strings, | |
32 | * which could contain a mix of UTF-8 and CESU-8. | |
33 | * More precisely: | |
34 | * - supplementary code points may be encoded with dedicated 4-byte sequences | |
35 | * (UTF-8 style) | |
36 | * - supplementary code points may be encoded with | |
37 | * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form | |
38 | * (CESU-8 style) | |
39 | * - single surrogates are allowed, encoded with their "natural" 3-byte sequences | |
40 | * | |
41 | * Limitation: | |
42 | * Right now, the macros do not attempt to assemble code points from pairs of | |
43 | * separately encoded surrogates. | |
44 | * This would not be sufficient for processing based on these macros, | |
45 | * but it is sufficient for a UCharIterator that returns only UChars anyway. | |
46 | * | |
47 | * The code is copied and modified from utf_impl.c and utf8.h. | |
48 | * The "strict" argument in the implementation functions is completely removed, | |
49 | * using the "<0" branch from the original code. | |
50 | * Checks for surrogate code points are removed for the leniency | |
51 | * described above. | |
52 | */ | |
53 | ||
54 | static const UChar32 | |
55 | lenient8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; | |
56 | ||
57 | static UChar32 | |
58 | lenient8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c) { | |
59 | int32_t i=*pi; | |
60 | uint8_t count=U8_COUNT_TRAIL_BYTES(c); | |
61 | if((i)+count<=(length)) { | |
62 | uint8_t trail, illegal=0; | |
63 | ||
64 | U8_MASK_LEAD_BYTE((c), count); | |
65 | /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ | |
66 | switch(count) { | |
67 | /* each branch falls through to the next one */ | |
68 | case 5: | |
69 | case 4: | |
70 | /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ | |
71 | illegal=1; | |
72 | break; | |
73 | case 3: | |
74 | trail=s[(i)++]; | |
75 | (c)=((c)<<6)|(trail&0x3f); | |
76 | if(c<0x110) { | |
77 | illegal|=(trail&0xc0)^0x80; | |
78 | } else { | |
79 | /* code point>0x10ffff, outside Unicode */ | |
80 | illegal=1; | |
81 | break; | |
82 | } | |
83 | case 2: | |
84 | trail=s[(i)++]; | |
85 | (c)=((c)<<6)|(trail&0x3f); | |
86 | illegal|=(trail&0xc0)^0x80; | |
87 | case 1: | |
88 | trail=s[(i)++]; | |
89 | (c)=((c)<<6)|(trail&0x3f); | |
90 | illegal|=(trail&0xc0)^0x80; | |
91 | break; | |
92 | case 0: | |
93 | return U_SENTINEL; | |
94 | /* no default branch to optimize switch() - all values are covered */ | |
95 | } | |
96 | ||
97 | /* correct sequence - all trail bytes have (b7..b6)==(10)? */ | |
98 | /* illegal is also set if count>=4 */ | |
99 | if(illegal || (c)<lenient8_minLegal[count]) { | |
100 | /* error handling */ | |
101 | uint8_t errorCount=count; | |
102 | /* don't go beyond this sequence */ | |
103 | i=*pi; | |
104 | while(count>0 && U8_IS_TRAIL(s[i])) { | |
105 | ++(i); | |
106 | --count; | |
107 | } | |
108 | c=U_SENTINEL; | |
109 | } | |
110 | } else /* too few bytes left */ { | |
111 | /* error handling */ | |
112 | int32_t i0=i; | |
113 | /* don't just set (i)=(length) in case there is an illegal sequence */ | |
114 | while((i)<(length) && U8_IS_TRAIL(s[i])) { | |
115 | ++(i); | |
116 | } | |
117 | c=U_SENTINEL; | |
118 | } | |
119 | *pi=i; | |
120 | return c; | |
121 | } | |
122 | ||
123 | static UChar32 | |
124 | lenient8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c) { | |
125 | int32_t i=*pi; | |
126 | uint8_t b, count=1, shift=6; | |
127 | ||
128 | /* extract value bits from the last trail byte */ | |
129 | c&=0x3f; | |
130 | ||
131 | for(;;) { | |
132 | if(i<=start) { | |
133 | /* no lead byte at all */ | |
134 | return U_SENTINEL; | |
135 | } | |
136 | ||
137 | /* read another previous byte */ | |
138 | b=s[--i]; | |
139 | if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */ | |
140 | if(b&0x40) { | |
141 | /* lead byte, this will always end the loop */ | |
142 | uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b); | |
143 | ||
144 | if(count==shouldCount) { | |
145 | /* set the new position */ | |
146 | *pi=i; | |
147 | U8_MASK_LEAD_BYTE(b, count); | |
148 | c|=(UChar32)b<<shift; | |
149 | if(count>=4 || c>0x10ffff || c<lenient8_minLegal[count]) { | |
150 | /* illegal sequence */ | |
151 | if(count>=4) { | |
152 | count=3; | |
153 | } | |
154 | c=U_SENTINEL; | |
155 | } else { | |
156 | /* exit with correct c */ | |
157 | } | |
158 | } else { | |
159 | /* the lead byte does not match the number of trail bytes */ | |
160 | /* only set the position to the lead byte if it would | |
161 | include the trail byte that we started with */ | |
162 | if(count<shouldCount) { | |
163 | *pi=i; | |
164 | } | |
165 | c=U_SENTINEL; | |
166 | } | |
167 | break; | |
168 | } else if(count<5) { | |
169 | /* trail byte */ | |
170 | c|=(UChar32)(b&0x3f)<<shift; | |
171 | ++count; | |
172 | shift+=6; | |
173 | } else { | |
174 | /* more than 5 trail bytes is illegal */ | |
175 | c=U_SENTINEL; | |
176 | break; | |
177 | } | |
178 | } else { | |
179 | /* single-byte character precedes trailing bytes */ | |
180 | c=U_SENTINEL; | |
181 | break; | |
182 | } | |
183 | } | |
184 | return c; | |
185 | } | |
186 | ||
/*
 * Read one code point from s at byte index i and advance i behind it.
 * The caller must ensure i<length before the call.
 *
 * s       byte string (const uint8_t *)
 * i       modifiable int32_t index; evaluated more than once
 * length  number of bytes in s
 * c       modifiable UChar32; receives the code point, or U_SENTINEL if the
 *         byte at i is >=0x80 but not a lead byte, or if
 *         lenient8_nextCharSafeBody() rejects the sequence
 *
 * Wrapped in do { } while(0) so that "L8_NEXT(...);" is a single statement
 * that is also safe in an unbraced if/else.
 */
#define L8_NEXT(s, i, length, c) do { \
    (c)=(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=lenient8_nextCharSafeBody((s), &(i), (int32_t)(length), (c)); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
197 | ||
/*
 * Move i back to the start of the code point that ends just before byte
 * index i of s, and read that code point.
 * The caller must ensure start<i before the call.
 *
 * s      byte string (const uint8_t *)
 * start  lowest index that may be read
 * i      modifiable int32_t index; evaluated more than once
 * c      modifiable UChar32; receives the code point, or U_SENTINEL if the
 *        byte before i is above 0xbf (i.e., not a trail byte), or if
 *        lenient8_prevCharSafeBody() rejects the sequence
 *
 * Wrapped in do { } while(0) so that "L8_PREV(...);" is a single statement
 * that is also safe in an unbraced if/else.
 */
#define L8_PREV(s, start, i, c) do { \
    (c)=(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=lenient8_prevCharSafeBody((s), (start), &(i), (c)); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
208 | ||
209 | /* lenient-8 UCharIterator -------------------------------------------------- */ | |
210 | ||
211 | /* | |
212 | * This is a copy of the UTF-8 UCharIterator in uiter.cpp, | |
213 | * except that it uses the lenient-8-bit-Unicode macros above. | |
214 | */ | |
215 | ||
216 | /* | |
217 | * Minimal implementation: | |
218 | * Maintain a single-UChar buffer for an additional surrogate. | |
219 | * The caller must not modify start and limit because they are used internally. | |
220 | * | |
221 | * Use UCharIterator fields as follows: | |
222 | * context pointer to UTF-8 string | |
223 | * length UTF-16 length of the string; -1 until lazy evaluation | |
224 | * start current UTF-8 index | |
225 | * index current UTF-16 index; may be -1="unknown" after setState() | |
226 | * limit UTF-8 length of the string | |
227 | * reservedField supplementary code point | |
228 | * | |
229 | * Since UCharIterator delivers 16-bit code units, the iteration can be | |
230 | * currently in the middle of the byte sequence for a supplementary code point. | |
231 | * In this case, reservedField will contain that code point and start will | |
232 | * point to after the corresponding byte sequence. The UTF-16 index will be | |
233 | * one less than what it would otherwise be corresponding to the UTF-8 index. | |
234 | * Otherwise, reservedField will be 0. | |
235 | */ | |
236 | ||
237 | /* | |
238 | * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: | |
239 | * Add implementations that do not call strlen() for iteration but check for NUL. | |
240 | */ | |
241 | ||
242 | static int32_t U_CALLCONV | |
243 | lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { | |
244 | switch(origin) { | |
245 | case UITER_ZERO: | |
246 | case UITER_START: | |
247 | return 0; | |
248 | case UITER_CURRENT: | |
249 | if(iter->index<0) { | |
250 | /* the current UTF-16 index is unknown after setState(), count from the beginning */ | |
251 | const uint8_t *s; | |
252 | UChar32 c; | |
253 | int32_t i, limit, index; | |
254 | ||
255 | s=(const uint8_t *)iter->context; | |
256 | i=index=0; | |
257 | limit=iter->start; /* count up to the UTF-8 index */ | |
258 | while(i<limit) { | |
259 | L8_NEXT(s, i, limit, c); | |
260 | if(c<=0xffff) { | |
261 | ++index; | |
262 | } else { | |
263 | index+=2; | |
264 | } | |
265 | } | |
266 | ||
267 | iter->start=i; /* just in case setState() did not get us to a code point boundary */ | |
268 | if(i==iter->limit) { | |
269 | iter->length=index; /* in case it was <0 or wrong */ | |
270 | } | |
271 | if(iter->reservedField!=0) { | |
272 | --index; /* we are in the middle of a supplementary code point */ | |
273 | } | |
274 | iter->index=index; | |
275 | } | |
276 | return iter->index; | |
277 | case UITER_LIMIT: | |
278 | case UITER_LENGTH: | |
279 | if(iter->length<0) { | |
280 | const uint8_t *s; | |
281 | UChar32 c; | |
282 | int32_t i, limit, length; | |
283 | ||
284 | s=(const uint8_t *)iter->context; | |
285 | if(iter->index<0) { | |
286 | /* | |
287 | * the current UTF-16 index is unknown after setState(), | |
288 | * we must first count from the beginning to here | |
289 | */ | |
290 | i=length=0; | |
291 | limit=iter->start; | |
292 | ||
293 | /* count from the beginning to the current index */ | |
294 | while(i<limit) { | |
295 | L8_NEXT(s, i, limit, c); | |
296 | if(c<=0xffff) { | |
297 | ++length; | |
298 | } else { | |
299 | length+=2; | |
300 | } | |
301 | } | |
302 | ||
303 | /* assume i==limit==iter->start, set the UTF-16 index */ | |
304 | iter->start=i; /* just in case setState() did not get us to a code point boundary */ | |
305 | iter->index= iter->reservedField!=0 ? length-1 : length; | |
306 | } else { | |
307 | i=iter->start; | |
308 | length=iter->index; | |
309 | if(iter->reservedField!=0) { | |
310 | ++length; | |
311 | } | |
312 | } | |
313 | ||
314 | /* count from the current index to the end */ | |
315 | limit=iter->limit; | |
316 | while(i<limit) { | |
317 | L8_NEXT(s, i, limit, c); | |
318 | if(c<=0xffff) { | |
319 | ++length; | |
320 | } else { | |
321 | length+=2; | |
322 | } | |
323 | } | |
324 | iter->length=length; | |
325 | } | |
326 | return iter->length; | |
327 | default: | |
328 | /* not a valid origin */ | |
329 | /* Should never get here! */ | |
330 | return -1; | |
331 | } | |
332 | } | |
333 | ||
334 | static int32_t U_CALLCONV | |
335 | lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { | |
336 | const uint8_t *s; | |
337 | UChar32 c; | |
338 | int32_t pos; /* requested UTF-16 index */ | |
339 | int32_t i; /* UTF-8 index */ | |
340 | UBool havePos; | |
341 | ||
342 | /* calculate the requested UTF-16 index */ | |
343 | switch(origin) { | |
344 | case UITER_ZERO: | |
345 | case UITER_START: | |
346 | pos=delta; | |
347 | havePos=TRUE; | |
348 | /* iter->index<0 (unknown) is possible */ | |
349 | break; | |
350 | case UITER_CURRENT: | |
351 | if(iter->index>=0) { | |
352 | pos=iter->index+delta; | |
353 | havePos=TRUE; | |
354 | } else { | |
355 | /* the current UTF-16 index is unknown after setState(), use only delta */ | |
356 | pos=0; | |
357 | havePos=FALSE; | |
358 | } | |
359 | break; | |
360 | case UITER_LIMIT: | |
361 | case UITER_LENGTH: | |
362 | if(iter->length>=0) { | |
363 | pos=iter->length+delta; | |
364 | havePos=TRUE; | |
365 | } else { | |
366 | /* pin to the end, avoid counting the length */ | |
367 | iter->index=-1; | |
368 | iter->start=iter->limit; | |
369 | iter->reservedField=0; | |
370 | if(delta>=0) { | |
371 | return UITER_UNKNOWN_INDEX; | |
372 | } else { | |
373 | /* the current UTF-16 index is unknown, use only delta */ | |
374 | pos=0; | |
375 | havePos=FALSE; | |
376 | } | |
377 | } | |
378 | break; | |
379 | default: | |
380 | return -1; /* Error */ | |
381 | } | |
382 | ||
383 | if(havePos) { | |
384 | /* shortcuts: pinning to the edges of the string */ | |
385 | if(pos<=0) { | |
386 | iter->index=iter->start=iter->reservedField=0; | |
387 | return 0; | |
388 | } else if(iter->length>=0 && pos>=iter->length) { | |
389 | iter->index=iter->length; | |
390 | iter->start=iter->limit; | |
391 | iter->reservedField=0; | |
392 | return iter->index; | |
393 | } | |
394 | ||
395 | /* minimize the number of L8_NEXT/PREV operations */ | |
396 | if(iter->index<0 || pos<iter->index/2) { | |
397 | /* go forward from the start instead of backward from the current index */ | |
398 | iter->index=iter->start=iter->reservedField=0; | |
399 | } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { | |
400 | /* | |
401 | * if we have the UTF-16 index and length and the new position is | |
402 | * closer to the end than the current index, | |
403 | * then go backward from the end instead of forward from the current index | |
404 | */ | |
405 | iter->index=iter->length; | |
406 | iter->start=iter->limit; | |
407 | iter->reservedField=0; | |
408 | } | |
409 | ||
410 | delta=pos-iter->index; | |
411 | if(delta==0) { | |
412 | return iter->index; /* nothing to do */ | |
413 | } | |
414 | } else { | |
415 | /* move relative to unknown UTF-16 index */ | |
416 | if(delta==0) { | |
417 | return UITER_UNKNOWN_INDEX; /* nothing to do */ | |
418 | } else if(-delta>=iter->start) { | |
419 | /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ | |
420 | iter->index=iter->start=iter->reservedField=0; | |
421 | return 0; | |
422 | } else if(delta>=(iter->limit-iter->start)) { | |
423 | /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ | |
424 | iter->index=iter->length; /* may or may not be <0 (unknown) */ | |
425 | iter->start=iter->limit; | |
426 | iter->reservedField=0; | |
427 | return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX; | |
428 | } | |
429 | } | |
430 | ||
431 | /* delta!=0 */ | |
432 | ||
433 | /* move towards the requested position, pin to the edges of the string */ | |
434 | s=(const uint8_t *)iter->context; | |
435 | pos=iter->index; /* could be <0 (unknown) */ | |
436 | i=iter->start; | |
437 | if(delta>0) { | |
438 | /* go forward */ | |
439 | int32_t limit=iter->limit; | |
440 | if(iter->reservedField!=0) { | |
441 | iter->reservedField=0; | |
442 | ++pos; | |
443 | --delta; | |
444 | } | |
445 | while(delta>0 && i<limit) { | |
446 | L8_NEXT(s, i, limit, c); | |
447 | if(c<0xffff) { | |
448 | ++pos; | |
449 | --delta; | |
450 | } else if(delta>=2) { | |
451 | pos+=2; | |
452 | delta-=2; | |
453 | } else /* delta==1 */ { | |
454 | /* stop in the middle of a supplementary code point */ | |
455 | iter->reservedField=c; | |
456 | ++pos; | |
457 | break; /* delta=0; */ | |
458 | } | |
459 | } | |
460 | if(i==limit) { | |
461 | if(iter->length<0 && iter->index>=0) { | |
462 | iter->length= iter->reservedField==0 ? pos : pos+1; | |
463 | } else if(iter->index<0 && iter->length>=0) { | |
464 | iter->index= iter->reservedField==0 ? iter->length : iter->length-1; | |
465 | } | |
466 | } | |
467 | } else /* delta<0 */ { | |
468 | /* go backward */ | |
469 | if(iter->reservedField!=0) { | |
470 | iter->reservedField=0; | |
471 | i-=4; /* we stayed behind the supplementary code point; go before it now */ | |
472 | --pos; | |
473 | ++delta; | |
474 | } | |
475 | while(delta<0 && i>0) { | |
476 | L8_PREV(s, 0, i, c); | |
477 | if(c<0xffff) { | |
478 | --pos; | |
479 | ++delta; | |
480 | } else if(delta<=-2) { | |
481 | pos-=2; | |
482 | delta+=2; | |
483 | } else /* delta==-1 */ { | |
484 | /* stop in the middle of a supplementary code point */ | |
485 | i+=4; /* back to behind this supplementary code point for consistent state */ | |
486 | iter->reservedField=c; | |
487 | --pos; | |
488 | break; /* delta=0; */ | |
489 | } | |
490 | } | |
491 | } | |
492 | ||
493 | iter->start=i; | |
494 | if(iter->index>=0) { | |
495 | return iter->index=pos; | |
496 | } else { | |
497 | /* we started with index<0 (unknown) so pos is bogus */ | |
498 | if(i<=1) { | |
499 | return iter->index=i; /* reached the beginning */ | |
500 | } else { | |
501 | /* we still don't know the UTF-16 index */ | |
502 | return UITER_UNKNOWN_INDEX; | |
503 | } | |
504 | } | |
505 | } | |
506 | ||
507 | static UBool U_CALLCONV | |
508 | lenient8IteratorHasNext(UCharIterator *iter) { | |
509 | return iter->reservedField!=0 || iter->start<iter->limit; | |
510 | } | |
511 | ||
512 | static UBool U_CALLCONV | |
513 | lenient8IteratorHasPrevious(UCharIterator *iter) { | |
514 | return iter->start>0; | |
515 | } | |
516 | ||
517 | static UChar32 U_CALLCONV | |
518 | lenient8IteratorCurrent(UCharIterator *iter) { | |
519 | if(iter->reservedField!=0) { | |
520 | return U16_TRAIL(iter->reservedField); | |
521 | } else if(iter->start<iter->limit) { | |
522 | const uint8_t *s=(const uint8_t *)iter->context; | |
523 | UChar32 c; | |
524 | int32_t i=iter->start; | |
525 | ||
526 | L8_NEXT(s, i, iter->limit, c); | |
527 | if(c<0) { | |
528 | return 0xfffd; | |
529 | } else if(c<=0xffff) { | |
530 | return c; | |
531 | } else { | |
532 | return U16_LEAD(c); | |
533 | } | |
534 | } else { | |
535 | return U_SENTINEL; | |
536 | } | |
537 | } | |
538 | ||
539 | static UChar32 U_CALLCONV | |
540 | lenient8IteratorNext(UCharIterator *iter) { | |
541 | int32_t index; | |
542 | ||
543 | if(iter->reservedField!=0) { | |
544 | UChar trail=U16_TRAIL(iter->reservedField); | |
545 | iter->reservedField=0; | |
546 | if((index=iter->index)>=0) { | |
547 | iter->index=index+1; | |
548 | } | |
549 | return trail; | |
550 | } else if(iter->start<iter->limit) { | |
551 | const uint8_t *s=(const uint8_t *)iter->context; | |
552 | UChar32 c; | |
553 | ||
554 | L8_NEXT(s, iter->start, iter->limit, c); | |
555 | if((index=iter->index)>=0) { | |
556 | iter->index=++index; | |
557 | if(iter->length<0 && iter->start==iter->limit) { | |
558 | iter->length= c<=0xffff ? index : index+1; | |
559 | } | |
560 | } else if(iter->start==iter->limit && iter->length>=0) { | |
561 | iter->index= c<=0xffff ? iter->length : iter->length-1; | |
562 | } | |
563 | if(c<0) { | |
564 | return 0xfffd; | |
565 | } else if(c<=0xffff) { | |
566 | return c; | |
567 | } else { | |
568 | iter->reservedField=c; | |
569 | return U16_LEAD(c); | |
570 | } | |
571 | } else { | |
572 | return U_SENTINEL; | |
573 | } | |
574 | } | |
575 | ||
576 | static UChar32 U_CALLCONV | |
577 | lenient8IteratorPrevious(UCharIterator *iter) { | |
578 | int32_t index; | |
579 | ||
580 | if(iter->reservedField!=0) { | |
581 | UChar lead=U16_LEAD(iter->reservedField); | |
582 | iter->reservedField=0; | |
583 | iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ | |
584 | if((index=iter->index)>0) { | |
585 | iter->index=index-1; | |
586 | } | |
587 | return lead; | |
588 | } else if(iter->start>0) { | |
589 | const uint8_t *s=(const uint8_t *)iter->context; | |
590 | UChar32 c; | |
591 | ||
592 | L8_PREV(s, 0, iter->start, c); | |
593 | if((index=iter->index)>0) { | |
594 | iter->index=index-1; | |
595 | } else if(iter->start<=1) { | |
596 | iter->index= c<=0xffff ? iter->start : iter->start+1; | |
597 | } | |
598 | if(c<0) { | |
599 | return 0xfffd; | |
600 | } else if(c<=0xffff) { | |
601 | return c; | |
602 | } else { | |
603 | iter->start+=4; /* back to behind this supplementary code point for consistent state */ | |
604 | iter->reservedField=c; | |
605 | return U16_TRAIL(c); | |
606 | } | |
607 | } else { | |
608 | return U_SENTINEL; | |
609 | } | |
610 | } | |
611 | ||
612 | static uint32_t U_CALLCONV | |
613 | lenient8IteratorGetState(const UCharIterator *iter) { | |
614 | uint32_t state=(uint32_t)(iter->start<<1); | |
615 | if(iter->reservedField!=0) { | |
616 | state|=1; | |
617 | } | |
618 | return state; | |
619 | } | |
620 | ||
621 | static void U_CALLCONV | |
622 | lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { | |
623 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
624 | /* do nothing */ | |
625 | } else if(iter==NULL) { | |
626 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
627 | } else if(state==lenient8IteratorGetState(iter)) { | |
628 | /* setting to the current state: no-op */ | |
629 | } else { | |
630 | int32_t index=(int32_t)(state>>1); /* UTF-8 index */ | |
631 | state&=1; /* 1 if in surrogate pair, must be index>=4 */ | |
632 | ||
633 | if((state==0 ? index<0 : index<4) || iter->limit<index) { | |
634 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
635 | } else { | |
636 | iter->start=index; /* restore UTF-8 byte index */ | |
637 | if(index<=1) { | |
638 | iter->index=index; | |
639 | } else { | |
640 | iter->index=-1; /* unknown UTF-16 index */ | |
641 | } | |
642 | if(state==0) { | |
643 | iter->reservedField=0; | |
644 | } else { | |
645 | /* verified index>=4 above */ | |
646 | UChar32 c; | |
647 | L8_PREV((const uint8_t *)iter->context, 0, index, c); | |
648 | if(c<=0xffff) { | |
649 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
650 | } else { | |
651 | iter->reservedField=c; | |
652 | } | |
653 | } | |
654 | } | |
655 | } | |
656 | } | |
657 | ||
658 | static const UCharIterator lenient8Iterator={ | |
659 | 0, 0, 0, 0, 0, 0, | |
660 | lenient8IteratorGetIndex, | |
661 | lenient8IteratorMove, | |
662 | lenient8IteratorHasNext, | |
663 | lenient8IteratorHasPrevious, | |
664 | lenient8IteratorCurrent, | |
665 | lenient8IteratorNext, | |
666 | lenient8IteratorPrevious, | |
667 | NULL, | |
668 | lenient8IteratorGetState, | |
669 | lenient8IteratorSetState | |
670 | }; | |
671 | ||
672 | U_CAPI void U_EXPORT2 | |
673 | uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) { | |
674 | if(iter!=0) { | |
675 | if(s!=0 && length>=-1) { | |
676 | *iter=lenient8Iterator; | |
677 | iter->context=s; | |
678 | if(length>=0) { | |
679 | iter->limit=length; | |
680 | } else { | |
681 | iter->limit=strlen(s); | |
682 | } | |
683 | iter->length= iter->limit<=1 ? iter->limit : -1; | |
684 | } else { | |
685 | /* set no-op iterator */ | |
686 | uiter_setString(iter, NULL, 0); | |
687 | } | |
688 | } | |
689 | } |