]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
73c04bcf | 4 | * Copyright (C) 2003-2006, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: uit_len8.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2003feb10 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This file contains the implementation of the "lenient UTF-8" UCharIterator | |
17 | * as used in the uciter8 sample code. | |
18 | * UTF-8-style macros are defined as well as the UCharIterator. | |
19 | * The macros are incomplete (do not assemble code points from pairs of | |
20 | * surrogates, see comment below) | |
21 | * but sufficient for the iterator. | |
22 | */ | |
23 | ||
24 | #include <string.h> | |
25 | #include "unicode/utypes.h" | |
26 | #include "unicode/uiter.h" | |
27 | ||
28 | /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */ | |
29 | ||
30 | /* | |
31 | * This code leniently reads 8-bit Unicode strings, | |
32 | * which could contain a mix of UTF-8 and CESU-8. | |
33 | * More precisely: | |
34 | * - supplementary code points may be encoded with dedicated 4-byte sequences | |
35 | * (UTF-8 style) | |
36 | * - supplementary code points may be encoded with | |
37 | * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form | |
38 | * (CESU-8 style) | |
39 | * - single surrogates are allowed, encoded with their "natural" 3-byte sequences | |
40 | * | |
41 | * Limitation: | |
42 | * Right now, the macros do not attempt to assemble code points from pairs of | |
43 | * separately encoded surrogates. | |
44 | * This would not be sufficient for processing based on these macros, | |
45 | * but it is sufficient for a UCharIterator that returns only UChars anyway. | |
46 | * | |
47 | * The code is copied and modified from utf_impl.c and utf8.h. | |
73c04bcf A |
48 | * |
49 | * Change 2006feb08: Much of the implementation code is replaced by calling | |
50 | * the utf_impl.c functions which accept a new "strict" parameter value | |
51 | * of -2 implementing exactly this leniency. | |
b75a7d8f A |
52 | */ |
53 | ||
/*
 * L8_NEXT(s, i, length, c)
 *
 * Leniently read one code point from the byte string s at byte offset i,
 * store it in c and advance i past the bytes that were consumed.
 *
 *   s       byte string (cast to const uint8_t * for the helper call)
 *   i       in/out byte offset; must be a modifiable lvalue
 *   length  byte limit, passed to utf8_nextCharSafeBody() as int32_t
 *   c       out: the byte value for bytes below 0x80, the result of
 *           utf8_nextCharSafeBody(..., -2) for lead bytes, or U_SENTINEL
 *           for any other byte
 *
 * The "strict" argument -2 selects the lenient UTF-8/CESU-8 behavior
 * described in the comment block above these macros.
 *
 * Every macro argument is parenthesized in the expansion so that
 * expressions such as "p+1" can be passed safely. The arguments s, i and c
 * are evaluated more than once, so they must not have side effects.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
64 | ||
/*
 * L8_PREV(s, start, i, c)
 *
 * Leniently read the code point that ends just before byte offset i in the
 * byte string s, store it in c and move i back.
 *
 *   s      byte string (cast to const uint8_t * for the helper call)
 *   start  lowest byte offset, passed through to utf8_prevCharSafeBody()
 *   i      in/out byte offset; must be a modifiable lvalue
 *   c      out: the byte value for bytes below 0x80, the result of
 *          utf8_prevCharSafeBody(..., -2) when the byte before i is in
 *          0x80..0xbf, or U_SENTINEL when it is 0xc0 or above
 *
 * The "strict" argument -2 selects the lenient UTF-8/CESU-8 behavior
 * described in the comment block above these macros.
 *
 * Every macro argument is parenthesized in the expansion so that
 * expressions can be passed safely. The arguments s, i and c are evaluated
 * more than once, so they must not have side effects.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
75 | ||
76 | /* lenient-8 UCharIterator -------------------------------------------------- */ | |
77 | ||
78 | /* | |
79 | * This is a copy of the UTF-8 UCharIterator in uiter.cpp, | |
80 | * except that it uses the lenient-8-bit-Unicode macros above. | |
81 | */ | |
82 | ||
83 | /* | |
84 | * Minimal implementation: | |
85 | * Maintain a single-UChar buffer for an additional surrogate. | |
86 | * The caller must not modify start and limit because they are used internally. | |
87 | * | |
88 | * Use UCharIterator fields as follows: | |
89 | * context pointer to UTF-8 string | |
90 | * length UTF-16 length of the string; -1 until lazy evaluation | |
91 | * start current UTF-8 index | |
92 | * index current UTF-16 index; may be -1="unknown" after setState() | |
93 | * limit UTF-8 length of the string | |
94 | * reservedField supplementary code point | |
95 | * | |
96 | * Since UCharIterator delivers 16-bit code units, the iteration can be | |
97 | * currently in the middle of the byte sequence for a supplementary code point. | |
98 | * In this case, reservedField will contain that code point and start will | |
99 | * point to after the corresponding byte sequence. The UTF-16 index will be | |
100 | * one less than what it would otherwise be corresponding to the UTF-8 index. | |
101 | * Otherwise, reservedField will be 0. | |
102 | */ | |
103 | ||
104 | /* | |
105 | * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: | |
106 | * Add implementations that do not call strlen() for iteration but check for NUL. | |
107 | */ | |
108 | ||
109 | static int32_t U_CALLCONV | |
110 | lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { | |
111 | switch(origin) { | |
112 | case UITER_ZERO: | |
113 | case UITER_START: | |
114 | return 0; | |
115 | case UITER_CURRENT: | |
116 | if(iter->index<0) { | |
117 | /* the current UTF-16 index is unknown after setState(), count from the beginning */ | |
118 | const uint8_t *s; | |
119 | UChar32 c; | |
120 | int32_t i, limit, index; | |
121 | ||
122 | s=(const uint8_t *)iter->context; | |
123 | i=index=0; | |
124 | limit=iter->start; /* count up to the UTF-8 index */ | |
125 | while(i<limit) { | |
126 | L8_NEXT(s, i, limit, c); | |
127 | if(c<=0xffff) { | |
128 | ++index; | |
129 | } else { | |
130 | index+=2; | |
131 | } | |
132 | } | |
133 | ||
134 | iter->start=i; /* just in case setState() did not get us to a code point boundary */ | |
135 | if(i==iter->limit) { | |
136 | iter->length=index; /* in case it was <0 or wrong */ | |
137 | } | |
138 | if(iter->reservedField!=0) { | |
139 | --index; /* we are in the middle of a supplementary code point */ | |
140 | } | |
141 | iter->index=index; | |
142 | } | |
143 | return iter->index; | |
144 | case UITER_LIMIT: | |
145 | case UITER_LENGTH: | |
146 | if(iter->length<0) { | |
147 | const uint8_t *s; | |
148 | UChar32 c; | |
149 | int32_t i, limit, length; | |
150 | ||
151 | s=(const uint8_t *)iter->context; | |
152 | if(iter->index<0) { | |
153 | /* | |
154 | * the current UTF-16 index is unknown after setState(), | |
155 | * we must first count from the beginning to here | |
156 | */ | |
157 | i=length=0; | |
158 | limit=iter->start; | |
159 | ||
160 | /* count from the beginning to the current index */ | |
161 | while(i<limit) { | |
162 | L8_NEXT(s, i, limit, c); | |
163 | if(c<=0xffff) { | |
164 | ++length; | |
165 | } else { | |
166 | length+=2; | |
167 | } | |
168 | } | |
169 | ||
170 | /* assume i==limit==iter->start, set the UTF-16 index */ | |
171 | iter->start=i; /* just in case setState() did not get us to a code point boundary */ | |
172 | iter->index= iter->reservedField!=0 ? length-1 : length; | |
173 | } else { | |
174 | i=iter->start; | |
175 | length=iter->index; | |
176 | if(iter->reservedField!=0) { | |
177 | ++length; | |
178 | } | |
179 | } | |
180 | ||
181 | /* count from the current index to the end */ | |
182 | limit=iter->limit; | |
183 | while(i<limit) { | |
184 | L8_NEXT(s, i, limit, c); | |
185 | if(c<=0xffff) { | |
186 | ++length; | |
187 | } else { | |
188 | length+=2; | |
189 | } | |
190 | } | |
191 | iter->length=length; | |
192 | } | |
193 | return iter->length; | |
194 | default: | |
195 | /* not a valid origin */ | |
196 | /* Should never get here! */ | |
197 | return -1; | |
198 | } | |
199 | } | |
200 | ||
201 | static int32_t U_CALLCONV | |
202 | lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { | |
203 | const uint8_t *s; | |
204 | UChar32 c; | |
205 | int32_t pos; /* requested UTF-16 index */ | |
206 | int32_t i; /* UTF-8 index */ | |
207 | UBool havePos; | |
208 | ||
209 | /* calculate the requested UTF-16 index */ | |
210 | switch(origin) { | |
211 | case UITER_ZERO: | |
212 | case UITER_START: | |
213 | pos=delta; | |
214 | havePos=TRUE; | |
215 | /* iter->index<0 (unknown) is possible */ | |
216 | break; | |
217 | case UITER_CURRENT: | |
218 | if(iter->index>=0) { | |
219 | pos=iter->index+delta; | |
220 | havePos=TRUE; | |
221 | } else { | |
222 | /* the current UTF-16 index is unknown after setState(), use only delta */ | |
223 | pos=0; | |
224 | havePos=FALSE; | |
225 | } | |
226 | break; | |
227 | case UITER_LIMIT: | |
228 | case UITER_LENGTH: | |
229 | if(iter->length>=0) { | |
230 | pos=iter->length+delta; | |
231 | havePos=TRUE; | |
232 | } else { | |
233 | /* pin to the end, avoid counting the length */ | |
234 | iter->index=-1; | |
235 | iter->start=iter->limit; | |
236 | iter->reservedField=0; | |
237 | if(delta>=0) { | |
238 | return UITER_UNKNOWN_INDEX; | |
239 | } else { | |
240 | /* the current UTF-16 index is unknown, use only delta */ | |
241 | pos=0; | |
242 | havePos=FALSE; | |
243 | } | |
244 | } | |
245 | break; | |
246 | default: | |
247 | return -1; /* Error */ | |
248 | } | |
249 | ||
250 | if(havePos) { | |
251 | /* shortcuts: pinning to the edges of the string */ | |
252 | if(pos<=0) { | |
253 | iter->index=iter->start=iter->reservedField=0; | |
254 | return 0; | |
255 | } else if(iter->length>=0 && pos>=iter->length) { | |
256 | iter->index=iter->length; | |
257 | iter->start=iter->limit; | |
258 | iter->reservedField=0; | |
259 | return iter->index; | |
260 | } | |
261 | ||
262 | /* minimize the number of L8_NEXT/PREV operations */ | |
263 | if(iter->index<0 || pos<iter->index/2) { | |
264 | /* go forward from the start instead of backward from the current index */ | |
265 | iter->index=iter->start=iter->reservedField=0; | |
266 | } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { | |
267 | /* | |
268 | * if we have the UTF-16 index and length and the new position is | |
269 | * closer to the end than the current index, | |
270 | * then go backward from the end instead of forward from the current index | |
271 | */ | |
272 | iter->index=iter->length; | |
273 | iter->start=iter->limit; | |
274 | iter->reservedField=0; | |
275 | } | |
276 | ||
277 | delta=pos-iter->index; | |
278 | if(delta==0) { | |
279 | return iter->index; /* nothing to do */ | |
280 | } | |
281 | } else { | |
282 | /* move relative to unknown UTF-16 index */ | |
283 | if(delta==0) { | |
284 | return UITER_UNKNOWN_INDEX; /* nothing to do */ | |
285 | } else if(-delta>=iter->start) { | |
286 | /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ | |
287 | iter->index=iter->start=iter->reservedField=0; | |
288 | return 0; | |
289 | } else if(delta>=(iter->limit-iter->start)) { | |
290 | /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ | |
291 | iter->index=iter->length; /* may or may not be <0 (unknown) */ | |
292 | iter->start=iter->limit; | |
293 | iter->reservedField=0; | |
294 | return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX; | |
295 | } | |
296 | } | |
297 | ||
298 | /* delta!=0 */ | |
299 | ||
300 | /* move towards the requested position, pin to the edges of the string */ | |
301 | s=(const uint8_t *)iter->context; | |
302 | pos=iter->index; /* could be <0 (unknown) */ | |
303 | i=iter->start; | |
304 | if(delta>0) { | |
305 | /* go forward */ | |
306 | int32_t limit=iter->limit; | |
307 | if(iter->reservedField!=0) { | |
308 | iter->reservedField=0; | |
309 | ++pos; | |
310 | --delta; | |
311 | } | |
312 | while(delta>0 && i<limit) { | |
313 | L8_NEXT(s, i, limit, c); | |
314 | if(c<0xffff) { | |
315 | ++pos; | |
316 | --delta; | |
317 | } else if(delta>=2) { | |
318 | pos+=2; | |
319 | delta-=2; | |
320 | } else /* delta==1 */ { | |
321 | /* stop in the middle of a supplementary code point */ | |
322 | iter->reservedField=c; | |
323 | ++pos; | |
324 | break; /* delta=0; */ | |
325 | } | |
326 | } | |
327 | if(i==limit) { | |
328 | if(iter->length<0 && iter->index>=0) { | |
329 | iter->length= iter->reservedField==0 ? pos : pos+1; | |
330 | } else if(iter->index<0 && iter->length>=0) { | |
331 | iter->index= iter->reservedField==0 ? iter->length : iter->length-1; | |
332 | } | |
333 | } | |
334 | } else /* delta<0 */ { | |
335 | /* go backward */ | |
336 | if(iter->reservedField!=0) { | |
337 | iter->reservedField=0; | |
338 | i-=4; /* we stayed behind the supplementary code point; go before it now */ | |
339 | --pos; | |
340 | ++delta; | |
341 | } | |
342 | while(delta<0 && i>0) { | |
343 | L8_PREV(s, 0, i, c); | |
344 | if(c<0xffff) { | |
345 | --pos; | |
346 | ++delta; | |
347 | } else if(delta<=-2) { | |
348 | pos-=2; | |
349 | delta+=2; | |
350 | } else /* delta==-1 */ { | |
351 | /* stop in the middle of a supplementary code point */ | |
352 | i+=4; /* back to behind this supplementary code point for consistent state */ | |
353 | iter->reservedField=c; | |
354 | --pos; | |
355 | break; /* delta=0; */ | |
356 | } | |
357 | } | |
358 | } | |
359 | ||
360 | iter->start=i; | |
361 | if(iter->index>=0) { | |
362 | return iter->index=pos; | |
363 | } else { | |
364 | /* we started with index<0 (unknown) so pos is bogus */ | |
365 | if(i<=1) { | |
366 | return iter->index=i; /* reached the beginning */ | |
367 | } else { | |
368 | /* we still don't know the UTF-16 index */ | |
369 | return UITER_UNKNOWN_INDEX; | |
370 | } | |
371 | } | |
372 | } | |
373 | ||
374 | static UBool U_CALLCONV | |
375 | lenient8IteratorHasNext(UCharIterator *iter) { | |
376 | return iter->reservedField!=0 || iter->start<iter->limit; | |
377 | } | |
378 | ||
379 | static UBool U_CALLCONV | |
380 | lenient8IteratorHasPrevious(UCharIterator *iter) { | |
381 | return iter->start>0; | |
382 | } | |
383 | ||
384 | static UChar32 U_CALLCONV | |
385 | lenient8IteratorCurrent(UCharIterator *iter) { | |
386 | if(iter->reservedField!=0) { | |
387 | return U16_TRAIL(iter->reservedField); | |
388 | } else if(iter->start<iter->limit) { | |
389 | const uint8_t *s=(const uint8_t *)iter->context; | |
390 | UChar32 c; | |
391 | int32_t i=iter->start; | |
392 | ||
393 | L8_NEXT(s, i, iter->limit, c); | |
394 | if(c<0) { | |
395 | return 0xfffd; | |
396 | } else if(c<=0xffff) { | |
397 | return c; | |
398 | } else { | |
399 | return U16_LEAD(c); | |
400 | } | |
401 | } else { | |
402 | return U_SENTINEL; | |
403 | } | |
404 | } | |
405 | ||
406 | static UChar32 U_CALLCONV | |
407 | lenient8IteratorNext(UCharIterator *iter) { | |
408 | int32_t index; | |
409 | ||
410 | if(iter->reservedField!=0) { | |
411 | UChar trail=U16_TRAIL(iter->reservedField); | |
412 | iter->reservedField=0; | |
413 | if((index=iter->index)>=0) { | |
414 | iter->index=index+1; | |
415 | } | |
416 | return trail; | |
417 | } else if(iter->start<iter->limit) { | |
418 | const uint8_t *s=(const uint8_t *)iter->context; | |
419 | UChar32 c; | |
420 | ||
421 | L8_NEXT(s, iter->start, iter->limit, c); | |
422 | if((index=iter->index)>=0) { | |
423 | iter->index=++index; | |
424 | if(iter->length<0 && iter->start==iter->limit) { | |
425 | iter->length= c<=0xffff ? index : index+1; | |
426 | } | |
427 | } else if(iter->start==iter->limit && iter->length>=0) { | |
428 | iter->index= c<=0xffff ? iter->length : iter->length-1; | |
429 | } | |
430 | if(c<0) { | |
431 | return 0xfffd; | |
432 | } else if(c<=0xffff) { | |
433 | return c; | |
434 | } else { | |
435 | iter->reservedField=c; | |
436 | return U16_LEAD(c); | |
437 | } | |
438 | } else { | |
439 | return U_SENTINEL; | |
440 | } | |
441 | } | |
442 | ||
443 | static UChar32 U_CALLCONV | |
444 | lenient8IteratorPrevious(UCharIterator *iter) { | |
445 | int32_t index; | |
446 | ||
447 | if(iter->reservedField!=0) { | |
448 | UChar lead=U16_LEAD(iter->reservedField); | |
449 | iter->reservedField=0; | |
450 | iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ | |
451 | if((index=iter->index)>0) { | |
452 | iter->index=index-1; | |
453 | } | |
454 | return lead; | |
455 | } else if(iter->start>0) { | |
456 | const uint8_t *s=(const uint8_t *)iter->context; | |
457 | UChar32 c; | |
458 | ||
459 | L8_PREV(s, 0, iter->start, c); | |
460 | if((index=iter->index)>0) { | |
461 | iter->index=index-1; | |
462 | } else if(iter->start<=1) { | |
463 | iter->index= c<=0xffff ? iter->start : iter->start+1; | |
464 | } | |
465 | if(c<0) { | |
466 | return 0xfffd; | |
467 | } else if(c<=0xffff) { | |
468 | return c; | |
469 | } else { | |
470 | iter->start+=4; /* back to behind this supplementary code point for consistent state */ | |
471 | iter->reservedField=c; | |
472 | return U16_TRAIL(c); | |
473 | } | |
474 | } else { | |
475 | return U_SENTINEL; | |
476 | } | |
477 | } | |
478 | ||
479 | static uint32_t U_CALLCONV | |
480 | lenient8IteratorGetState(const UCharIterator *iter) { | |
481 | uint32_t state=(uint32_t)(iter->start<<1); | |
482 | if(iter->reservedField!=0) { | |
483 | state|=1; | |
484 | } | |
485 | return state; | |
486 | } | |
487 | ||
488 | static void U_CALLCONV | |
489 | lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { | |
490 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
491 | /* do nothing */ | |
492 | } else if(iter==NULL) { | |
493 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
494 | } else if(state==lenient8IteratorGetState(iter)) { | |
495 | /* setting to the current state: no-op */ | |
496 | } else { | |
497 | int32_t index=(int32_t)(state>>1); /* UTF-8 index */ | |
498 | state&=1; /* 1 if in surrogate pair, must be index>=4 */ | |
499 | ||
500 | if((state==0 ? index<0 : index<4) || iter->limit<index) { | |
501 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
502 | } else { | |
503 | iter->start=index; /* restore UTF-8 byte index */ | |
504 | if(index<=1) { | |
505 | iter->index=index; | |
506 | } else { | |
507 | iter->index=-1; /* unknown UTF-16 index */ | |
508 | } | |
509 | if(state==0) { | |
510 | iter->reservedField=0; | |
511 | } else { | |
512 | /* verified index>=4 above */ | |
513 | UChar32 c; | |
514 | L8_PREV((const uint8_t *)iter->context, 0, index, c); | |
515 | if(c<=0xffff) { | |
516 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
517 | } else { | |
518 | iter->reservedField=c; | |
519 | } | |
520 | } | |
521 | } | |
522 | } | |
523 | } | |
524 | ||
525 | static const UCharIterator lenient8Iterator={ | |
526 | 0, 0, 0, 0, 0, 0, | |
527 | lenient8IteratorGetIndex, | |
528 | lenient8IteratorMove, | |
529 | lenient8IteratorHasNext, | |
530 | lenient8IteratorHasPrevious, | |
531 | lenient8IteratorCurrent, | |
532 | lenient8IteratorNext, | |
533 | lenient8IteratorPrevious, | |
534 | NULL, | |
535 | lenient8IteratorGetState, | |
536 | lenient8IteratorSetState | |
537 | }; | |
538 | ||
539 | U_CAPI void U_EXPORT2 | |
540 | uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) { | |
541 | if(iter!=0) { | |
542 | if(s!=0 && length>=-1) { | |
543 | *iter=lenient8Iterator; | |
544 | iter->context=s; | |
545 | if(length>=0) { | |
546 | iter->limit=length; | |
547 | } else { | |
548 | iter->limit=strlen(s); | |
549 | } | |
550 | iter->length= iter->limit<=1 ? iter->limit : -1; | |
551 | } else { | |
552 | /* set no-op iterator */ | |
553 | uiter_setString(iter, NULL, 0); | |
554 | } | |
555 | } | |
556 | } |