]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
4 | * Copyright (C) 2003-2004, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ****************************************************************************** | |
8 | * file name: ucnv_ext.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2003jun13 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Conversion extensions | |
17 | */ | |
18 | ||
19 | #include "unicode/utypes.h" | |
20 | ||
21 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION | |
22 | ||
23 | #include "unicode/uset.h" | |
24 | #include "ucnv_bld.h" | |
25 | #include "ucnv_cnv.h" | |
26 | #include "ucnv_ext.h" | |
27 | #include "cmemory.h" | |
28 | ||
29 | /* to Unicode --------------------------------------------------------------- */ | |
30 | ||
31 | /* | |
32 | * @return lookup value for the byte, if found; else 0 | |
33 | */ | |
34 | static U_INLINE uint32_t | |
35 | ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) { | |
36 | uint32_t word0, word; | |
37 | int32_t i, start, limit; | |
38 | ||
39 | /* check the input byte against the lowest and highest section bytes */ | |
40 | start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]); | |
41 | limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]); | |
42 | if(byte<start || limit<byte) { | |
43 | return 0; /* the byte is out of range */ | |
44 | } | |
45 | ||
46 | if(length==((limit-start)+1)) { | |
47 | /* direct access on a linear array */ | |
48 | return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */ | |
49 | } | |
50 | ||
51 | /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ | |
52 | word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0); | |
53 | ||
54 | /* | |
55 | * Shift byte once instead of each section word and add 0xffffff. | |
56 | * We will compare the shifted/added byte (bbffffff) against | |
57 | * section words which have byte values in the same bit position. | |
58 | * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv | |
59 | * for all v=0..f | |
60 | * so we need not mask off the lower 24 bits of each section word. | |
61 | */ | |
62 | word=word0|UCNV_EXT_TO_U_VALUE_MASK; | |
63 | ||
64 | /* binary search */ | |
65 | start=0; | |
66 | limit=length; | |
67 | for(;;) { | |
68 | i=limit-start; | |
69 | if(i<=1) { | |
70 | break; /* done */ | |
71 | } | |
72 | /* start<limit-1 */ | |
73 | ||
74 | if(i<=4) { | |
75 | /* linear search for the last part */ | |
76 | if(word0<=toUSection[start]) { | |
77 | break; | |
78 | } | |
79 | if(++start<limit && word0<=toUSection[start]) { | |
80 | break; | |
81 | } | |
82 | if(++start<limit && word0<=toUSection[start]) { | |
83 | break; | |
84 | } | |
85 | /* always break at start==limit-1 */ | |
86 | ++start; | |
87 | break; | |
88 | } | |
89 | ||
90 | i=(start+limit)/2; | |
91 | if(word<toUSection[i]) { | |
92 | limit=i; | |
93 | } else { | |
94 | start=i; | |
95 | } | |
96 | } | |
97 | ||
98 | /* did we really find it? */ | |
99 | if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) { | |
100 | return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */ | |
101 | } else { | |
102 | return 0; /* not found */ | |
103 | } | |
104 | } | |
105 | ||
/*
 * TRUE if not an SI/SO stateful converter (sisoState<0),
 * or if the match length fits with the current converter state:
 * state 0 requires a match length of exactly 1, any other state requires
 * a match length different from 1.
 *
 * Both arguments are fully parenthesized so that arbitrary expressions
 * (not only simple sums like i+j) can be passed for them.
 */
#define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
    ((sisoState)<0 || ((sisoState)==0) == ((match)==1))
112 | ||
113 | /* | |
114 | * this works like ucnv_extMatchFromU() except | |
115 | * - the first character is in pre | |
116 | * - no trie is used | |
117 | * - the returned matchLength is not offset by 2 | |
118 | */ | |
119 | static int32_t | |
120 | ucnv_extMatchToU(const int32_t *cx, int8_t sisoState, | |
121 | const char *pre, int32_t preLength, | |
122 | const char *src, int32_t srcLength, | |
123 | uint32_t *pMatchValue, | |
124 | UBool useFallback, UBool flush) { | |
125 | const uint32_t *toUTable, *toUSection; | |
126 | ||
127 | uint32_t value, matchValue; | |
128 | int32_t i, j, index, length, matchLength; | |
129 | uint8_t b; | |
130 | ||
131 | if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) { | |
132 | return 0; /* no extension data, no match */ | |
133 | } | |
134 | ||
135 | /* initialize */ | |
136 | toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t); | |
137 | index=0; | |
138 | ||
139 | matchValue=0; | |
140 | i=j=matchLength=0; | |
141 | ||
142 | if(sisoState==0) { | |
143 | /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ | |
144 | if(preLength>1) { | |
145 | return 0; /* no match of a DBCS sequence in SBCS mode */ | |
146 | } else if(preLength==1) { | |
147 | srcLength=0; | |
148 | } else /* preLength==0 */ { | |
149 | if(srcLength>1) { | |
150 | srcLength=1; | |
151 | } | |
152 | } | |
153 | flush=TRUE; | |
154 | } | |
155 | ||
156 | /* we must not remember fallback matches when not using fallbacks */ | |
157 | ||
158 | /* match input units until there is a full match or the input is consumed */ | |
159 | for(;;) { | |
160 | /* go to the next section */ | |
161 | toUSection=toUTable+index; | |
162 | ||
163 | /* read first pair of the section */ | |
164 | value=*toUSection++; | |
165 | length=UCNV_EXT_TO_U_GET_BYTE(value); | |
166 | value=UCNV_EXT_TO_U_GET_VALUE(value); | |
167 | if( value!=0 && | |
168 | (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || | |
169 | TO_U_USE_FALLBACK(useFallback)) && | |
170 | UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) | |
171 | ) { | |
172 | /* remember longest match so far */ | |
173 | matchValue=value; | |
174 | matchLength=i+j; | |
175 | } | |
176 | ||
177 | /* match pre[] then src[] */ | |
178 | if(i<preLength) { | |
179 | b=(uint8_t)pre[i++]; | |
180 | } else if(j<srcLength) { | |
181 | b=(uint8_t)src[j++]; | |
182 | } else { | |
183 | /* all input consumed, partial match */ | |
184 | if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) { | |
185 | /* | |
186 | * end of the entire input stream, stop with the longest match so far | |
187 | * or: partial match must not be longer than UCNV_EXT_MAX_BYTES | |
188 | * because it must fit into state buffers | |
189 | */ | |
190 | break; | |
191 | } else { | |
192 | /* continue with more input next time */ | |
193 | return -length; | |
194 | } | |
195 | } | |
196 | ||
197 | /* search for the current UChar */ | |
198 | value=ucnv_extFindToU(toUSection, length, b); | |
199 | if(value==0) { | |
200 | /* no match here, stop with the longest match so far */ | |
201 | break; | |
202 | } else { | |
203 | if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { | |
204 | /* partial match, continue */ | |
205 | index=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value); | |
206 | } else { | |
207 | if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || | |
208 | TO_U_USE_FALLBACK(useFallback)) && | |
209 | UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) | |
210 | ) { | |
211 | /* full match, stop with result */ | |
212 | matchValue=value; | |
213 | matchLength=i+j; | |
214 | } else { | |
215 | /* full match on fallback not taken, stop with the longest match so far */ | |
216 | } | |
217 | break; | |
218 | } | |
219 | } | |
220 | } | |
221 | ||
222 | if(matchLength==0) { | |
223 | /* no match at all */ | |
224 | return 0; | |
225 | } | |
226 | ||
227 | /* return result */ | |
228 | *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue); | |
229 | return matchLength; | |
230 | } | |
231 | ||
232 | static U_INLINE void | |
233 | ucnv_extWriteToU(UConverter *cnv, const int32_t *cx, | |
234 | uint32_t value, | |
235 | UChar **target, const UChar *targetLimit, | |
236 | int32_t **offsets, int32_t srcIndex, | |
237 | UErrorCode *pErrorCode) { | |
238 | /* output the result */ | |
239 | if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { | |
240 | /* output a single code point */ | |
241 | ucnv_toUWriteCodePoint( | |
242 | cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value), | |
243 | target, targetLimit, | |
244 | offsets, srcIndex, | |
245 | pErrorCode); | |
246 | } else { | |
247 | /* output a string - with correct data we have resultLength>0 */ | |
248 | ucnv_toUWriteUChars( | |
249 | cnv, | |
250 | UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+ | |
251 | UCNV_EXT_TO_U_GET_INDEX(value), | |
252 | UCNV_EXT_TO_U_GET_LENGTH(value), | |
253 | target, targetLimit, | |
254 | offsets, srcIndex, | |
255 | pErrorCode); | |
256 | } | |
257 | } | |
258 | ||
/*
 * SI/SO toU state of a converter:
 * - MBCS_OUTPUT_2_SISO converters: cnv->mode (state 0 is for SBCS, 1 for DBCS)
 * - MBCS_OUTPUT_DBCS_ONLY converters: always 1
 * - all other converters: -1 (not SI/SO stateful)
 *
 * Note: For SI/SO stateful converters getting here,
 * cnv->mode==0 is equivalent to firstLength==1.
 */
#define UCNV_SISO_STATE(cnv) \
    ( \
        (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? \
            (int8_t)(cnv)->mode : \
        (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? \
            1 : \
            -1 \
    )
270 | ||
271 | /* | |
272 | * target<targetLimit; set error code for overflow | |
273 | */ | |
274 | U_CFUNC UBool | |
275 | ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, | |
276 | int32_t firstLength, | |
277 | const char **src, const char *srcLimit, | |
278 | UChar **target, const UChar *targetLimit, | |
279 | int32_t **offsets, int32_t srcIndex, | |
280 | UBool flush, | |
281 | UErrorCode *pErrorCode) { | |
282 | uint32_t value; | |
283 | int32_t match; | |
284 | ||
285 | /* try to match */ | |
286 | match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv), | |
287 | (const char *)cnv->toUBytes, firstLength, | |
288 | *src, (int32_t)(srcLimit-*src), | |
289 | &value, | |
290 | cnv->useFallback, flush); | |
291 | if(match>0) { | |
292 | /* advance src pointer for the consumed input */ | |
293 | *src+=match-firstLength; | |
294 | ||
295 | /* write result to target */ | |
296 | ucnv_extWriteToU(cnv, cx, | |
297 | value, | |
298 | target, targetLimit, | |
299 | offsets, srcIndex, | |
300 | pErrorCode); | |
301 | return TRUE; | |
302 | } else if(match<0) { | |
303 | /* save state for partial match */ | |
304 | const char *s; | |
305 | int32_t j; | |
306 | ||
307 | /* copy the first code point */ | |
308 | s=(const char *)cnv->toUBytes; | |
309 | cnv->preToUFirstLength=(int8_t)firstLength; | |
310 | for(j=0; j<firstLength; ++j) { | |
311 | cnv->preToU[j]=*s++; | |
312 | } | |
313 | ||
314 | /* now copy the newly consumed input */ | |
315 | s=*src; | |
316 | match=-match; | |
317 | for(; j<match; ++j) { | |
318 | cnv->preToU[j]=*s++; | |
319 | } | |
320 | *src=s; /* same as *src=srcLimit; because we reached the end of input */ | |
321 | cnv->preToULength=(int8_t)match; | |
322 | return TRUE; | |
323 | } else /* match==0 no match */ { | |
324 | return FALSE; | |
325 | } | |
326 | } | |
327 | ||
328 | U_CFUNC UChar32 | |
329 | ucnv_extSimpleMatchToU(const int32_t *cx, | |
330 | const char *source, int32_t length, | |
331 | UBool useFallback) { | |
332 | uint32_t value; | |
333 | int32_t match; | |
334 | ||
335 | if(length<=0) { | |
336 | return 0xffff; | |
337 | } | |
338 | ||
339 | /* try to match */ | |
340 | match=ucnv_extMatchToU(cx, -1, | |
341 | source, length, | |
342 | NULL, 0, | |
343 | &value, | |
344 | useFallback, TRUE); | |
345 | if(match==length) { | |
346 | /* write result for simple, single-character conversion */ | |
347 | if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { | |
348 | return UCNV_EXT_TO_U_GET_CODE_POINT(value); | |
349 | } | |
350 | } | |
351 | ||
352 | /* | |
353 | * return no match because | |
354 | * - match>0 && value points to string: simple conversion cannot handle multiple code points | |
355 | * - match>0 && match!=length: not all input consumed, forbidden for this function | |
356 | * - match==0: no match found in the first place | |
357 | * - match<0: partial match, not supported for simple conversion (and flush==TRUE) | |
358 | */ | |
359 | return 0xfffe; | |
360 | } | |
361 | ||
362 | /* | |
363 | * continue partial match with new input | |
364 | * never called for simple, single-character conversion | |
365 | */ | |
366 | U_CFUNC void | |
367 | ucnv_extContinueMatchToU(UConverter *cnv, | |
368 | UConverterToUnicodeArgs *pArgs, int32_t srcIndex, | |
369 | UErrorCode *pErrorCode) { | |
370 | uint32_t value; | |
371 | int32_t match, length; | |
372 | ||
373 | match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv), | |
374 | cnv->preToU, cnv->preToULength, | |
375 | pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), | |
376 | &value, | |
377 | cnv->useFallback, pArgs->flush); | |
378 | if(match>0) { | |
379 | if(match>=cnv->preToULength) { | |
380 | /* advance src pointer for the consumed input */ | |
381 | pArgs->source+=match-cnv->preToULength; | |
382 | cnv->preToULength=0; | |
383 | } else { | |
384 | /* the match did not use all of preToU[] - keep the rest for replay */ | |
385 | length=cnv->preToULength-match; | |
386 | uprv_memmove(cnv->preToU, cnv->preToU+match, length); | |
387 | cnv->preToULength=(int8_t)-length; | |
388 | } | |
389 | ||
390 | /* write result */ | |
391 | ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes, | |
392 | value, | |
393 | &pArgs->target, pArgs->targetLimit, | |
394 | &pArgs->offsets, srcIndex, | |
395 | pErrorCode); | |
396 | } else if(match<0) { | |
397 | /* save state for partial match */ | |
398 | const char *s; | |
399 | int32_t j; | |
400 | ||
401 | /* just _append_ the newly consumed input to preToU[] */ | |
402 | s=pArgs->source; | |
403 | match=-match; | |
404 | for(j=cnv->preToULength; j<match; ++j) { | |
405 | cnv->preToU[j]=*s++; | |
406 | } | |
407 | pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ | |
408 | cnv->preToULength=(int8_t)match; | |
409 | } else /* match==0 */ { | |
410 | /* | |
411 | * no match | |
412 | * | |
413 | * We need to split the previous input into two parts: | |
414 | * | |
415 | * 1. The first codepage character is unmappable - that's how we got into | |
416 | * trying the extension data in the first place. | |
417 | * We need to move it from the preToU buffer | |
418 | * to the error buffer, set an error code, | |
419 | * and prepare the rest of the previous input for 2. | |
420 | * | |
421 | * 2. The rest of the previous input must be converted once we | |
422 | * come back from the callback for the first character. | |
423 | * At that time, we have to try again from scratch to convert | |
424 | * these input characters. | |
425 | * The replay will be handled by the ucnv.c conversion code. | |
426 | */ | |
427 | ||
428 | /* move the first codepage character to the error field */ | |
429 | uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength); | |
430 | cnv->toULength=cnv->preToUFirstLength; | |
431 | ||
432 | /* move the rest up inside the buffer */ | |
433 | length=cnv->preToULength-cnv->preToUFirstLength; | |
434 | if(length>0) { | |
435 | uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length); | |
436 | } | |
437 | ||
438 | /* mark preToU for replay */ | |
439 | cnv->preToULength=(int8_t)-length; | |
440 | ||
441 | /* set the error code for unassigned */ | |
442 | *pErrorCode=U_INVALID_CHAR_FOUND; | |
443 | } | |
444 | } | |
445 | ||
446 | /* from Unicode ------------------------------------------------------------- */ | |
447 | ||
448 | /* | |
449 | * @return index of the UChar, if found; else <0 | |
450 | */ | |
451 | static U_INLINE int32_t | |
452 | ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) { | |
453 | int32_t i, start, limit; | |
454 | ||
455 | /* binary search */ | |
456 | start=0; | |
457 | limit=length; | |
458 | for(;;) { | |
459 | i=limit-start; | |
460 | if(i<=1) { | |
461 | break; /* done */ | |
462 | } | |
463 | /* start<limit-1 */ | |
464 | ||
465 | if(i<=4) { | |
466 | /* linear search for the last part */ | |
467 | if(u<=fromUSection[start]) { | |
468 | break; | |
469 | } | |
470 | if(++start<limit && u<=fromUSection[start]) { | |
471 | break; | |
472 | } | |
473 | if(++start<limit && u<=fromUSection[start]) { | |
474 | break; | |
475 | } | |
476 | /* always break at start==limit-1 */ | |
477 | ++start; | |
478 | break; | |
479 | } | |
480 | ||
481 | i=(start+limit)/2; | |
482 | if(u<fromUSection[i]) { | |
483 | limit=i; | |
484 | } else { | |
485 | start=i; | |
486 | } | |
487 | } | |
488 | ||
489 | /* did we really find it? */ | |
490 | if(start<limit && u==fromUSection[start]) { | |
491 | return start; | |
492 | } else { | |
493 | return -1; /* not found */ | |
494 | } | |
495 | } | |
496 | ||
497 | /* | |
498 | * @param cx pointer to extension data; if NULL, returns 0 | |
499 | * @param firstCP the first code point before all the other UChars | |
500 | * @param pre UChars that must match; !initialMatch: partial match with them | |
501 | * @param preLength length of pre, >=0 | |
502 | * @param src UChars that can be used to complete a match | |
503 | * @param srcLength length of src, >=0 | |
504 | * @param pMatchValue [out] output result value for the match from the data structure | |
505 | * @param useFallback "use fallback" flag, usually from cnv->useFallback | |
506 | * @param flush TRUE if the end of the input stream is reached | |
507 | * @return >1: matched, return value=total match length (number of input units matched) | |
508 | * 1: matched, no mapping but request for <subchar1> | |
509 | * (only for the first code point) | |
510 | * 0: no match | |
511 | * <0: partial match, return value=negative total match length | |
512 | * (partial matches are never returned for flush==TRUE) | |
513 | * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) | |
514 | * the matchLength is 2 if only firstCP matched, and >2 if firstCP and | |
515 | * further code units matched | |
516 | */ | |
517 | static int32_t | |
518 | ucnv_extMatchFromU(const int32_t *cx, | |
519 | UChar32 firstCP, | |
520 | const UChar *pre, int32_t preLength, | |
521 | const UChar *src, int32_t srcLength, | |
522 | uint32_t *pMatchValue, | |
523 | UBool useFallback, UBool flush) { | |
524 | const uint16_t *stage12, *stage3; | |
525 | const uint32_t *stage3b; | |
526 | ||
527 | const UChar *fromUTableUChars, *fromUSectionUChars; | |
528 | const uint32_t *fromUTableValues, *fromUSectionValues; | |
529 | ||
530 | uint32_t value, matchValue; | |
531 | int32_t i, j, index, length, matchLength; | |
532 | UChar c; | |
533 | ||
534 | if(cx==NULL) { | |
535 | return 0; /* no extension data, no match */ | |
536 | } | |
537 | ||
538 | /* trie lookup of firstCP */ | |
539 | index=firstCP>>10; /* stage 1 index */ | |
540 | if(index>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) { | |
541 | return 0; /* the first code point is outside the trie */ | |
542 | } | |
543 | ||
544 | stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); | |
545 | stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); | |
546 | index=UCNV_EXT_FROM_U(stage12, stage3, index, firstCP); | |
547 | ||
548 | stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); | |
549 | value=stage3b[index]; | |
550 | if(value==0) { | |
551 | return 0; | |
552 | } | |
553 | ||
554 | if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { | |
555 | /* partial match, enter the loop below */ | |
556 | index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); | |
557 | ||
558 | /* initialize */ | |
559 | fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar); | |
560 | fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t); | |
561 | ||
562 | matchValue=0; | |
563 | i=j=matchLength=0; | |
564 | ||
565 | /* we must not remember fallback matches when not using fallbacks */ | |
566 | ||
567 | /* match input units until there is a full match or the input is consumed */ | |
568 | for(;;) { | |
569 | /* go to the next section */ | |
570 | fromUSectionUChars=fromUTableUChars+index; | |
571 | fromUSectionValues=fromUTableValues+index; | |
572 | ||
573 | /* read first pair of the section */ | |
574 | length=*fromUSectionUChars++; | |
575 | value=*fromUSectionValues++; | |
576 | if( value!=0 && | |
577 | (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || | |
578 | FROM_U_USE_FALLBACK(useFallback, firstCP)) | |
579 | ) { | |
580 | /* remember longest match so far */ | |
581 | matchValue=value; | |
582 | matchLength=2+i+j; | |
583 | } | |
584 | ||
585 | /* match pre[] then src[] */ | |
586 | if(i<preLength) { | |
587 | c=pre[i++]; | |
588 | } else if(j<srcLength) { | |
589 | c=src[j++]; | |
590 | } else { | |
591 | /* all input consumed, partial match */ | |
592 | if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) { | |
593 | /* | |
594 | * end of the entire input stream, stop with the longest match so far | |
595 | * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS | |
596 | * because it must fit into state buffers | |
597 | */ | |
598 | break; | |
599 | } else { | |
600 | /* continue with more input next time */ | |
601 | return -(2+length); | |
602 | } | |
603 | } | |
604 | ||
605 | /* search for the current UChar */ | |
606 | index=ucnv_extFindFromU(fromUSectionUChars, length, c); | |
607 | if(index<0) { | |
608 | /* no match here, stop with the longest match so far */ | |
609 | break; | |
610 | } else { | |
611 | value=fromUSectionValues[index]; | |
612 | if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { | |
613 | /* partial match, continue */ | |
614 | index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); | |
615 | } else { | |
616 | if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || | |
617 | FROM_U_USE_FALLBACK(useFallback, firstCP) | |
618 | ) { | |
619 | /* full match, stop with result */ | |
620 | matchValue=value; | |
621 | matchLength=2+i+j; | |
622 | } else { | |
623 | /* full match on fallback not taken, stop with the longest match so far */ | |
624 | } | |
625 | break; | |
626 | } | |
627 | } | |
628 | } | |
629 | ||
630 | if(matchLength==0) { | |
631 | /* no match at all */ | |
632 | return 0; | |
633 | } | |
634 | } else /* result from firstCP trie lookup */ { | |
635 | if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || | |
636 | FROM_U_USE_FALLBACK(useFallback, firstCP) | |
637 | ) { | |
638 | /* full match, stop with result */ | |
639 | matchValue=value; | |
640 | matchLength=2; | |
641 | } else { | |
642 | /* fallback not taken */ | |
643 | return 0; | |
644 | } | |
645 | } | |
646 | ||
647 | if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) { | |
648 | /* do not interpret values with reserved bits used, for forward compatibility */ | |
649 | return 0; | |
650 | } | |
651 | ||
652 | /* return result */ | |
653 | if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { | |
654 | return 1; /* assert matchLength==2 */ | |
655 | } | |
656 | ||
657 | *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue); | |
658 | return matchLength; | |
659 | } | |
660 | ||
661 | static U_INLINE void | |
662 | ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, | |
663 | uint32_t value, | |
664 | char **target, const char *targetLimit, | |
665 | int32_t **offsets, int32_t srcIndex, | |
666 | UErrorCode *pErrorCode) { | |
667 | uint8_t buffer[1+UCNV_EXT_MAX_BYTES]; | |
668 | const uint8_t *result; | |
669 | int32_t length, prevLength; | |
670 | ||
671 | length=UCNV_EXT_FROM_U_GET_LENGTH(value); | |
672 | value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); | |
673 | ||
674 | /* output the result */ | |
675 | if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { | |
676 | /* | |
677 | * Generate a byte array and then write it below. | |
678 | * This is not the fastest possible way, but it should be ok for | |
679 | * extension mappings, and it is much simpler. | |
680 | * Offset and overflow handling are only done once this way. | |
681 | */ | |
682 | uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */ | |
683 | switch(length) { | |
684 | case 3: | |
685 | *p++=(uint8_t)(value>>16); | |
686 | case 2: | |
687 | *p++=(uint8_t)(value>>8); | |
688 | case 1: | |
689 | *p++=(uint8_t)value; | |
690 | default: | |
691 | break; /* will never occur */ | |
692 | } | |
693 | result=buffer+1; | |
694 | } else { | |
695 | result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; | |
696 | } | |
697 | ||
698 | /* with correct data we have length>0 */ | |
699 | ||
700 | if((prevLength=cnv->fromUnicodeStatus)!=0) { | |
701 | /* handle SI/SO stateful output */ | |
702 | uint8_t shiftByte; | |
703 | ||
704 | if(prevLength>1 && length==1) { | |
705 | /* change from double-byte mode to single-byte */ | |
706 | shiftByte=(uint8_t)UCNV_SI; | |
707 | cnv->fromUnicodeStatus=1; | |
708 | } else if(prevLength==1 && length>1) { | |
709 | /* change from single-byte mode to double-byte */ | |
710 | shiftByte=(uint8_t)UCNV_SO; | |
711 | cnv->fromUnicodeStatus=2; | |
712 | } else { | |
713 | shiftByte=0; | |
714 | } | |
715 | ||
716 | if(shiftByte!=0) { | |
717 | /* prepend the shift byte to the result bytes */ | |
718 | buffer[0]=shiftByte; | |
719 | if(result!=buffer+1) { | |
720 | uprv_memcpy(buffer+1, result, length); | |
721 | } | |
722 | result=buffer; | |
723 | ++length; | |
724 | } | |
725 | } | |
726 | ||
727 | ucnv_fromUWriteBytes(cnv, (const char *)result, length, | |
728 | target, targetLimit, | |
729 | offsets, srcIndex, | |
730 | pErrorCode); | |
731 | } | |
732 | ||
733 | /* | |
734 | * target<targetLimit; set error code for overflow | |
735 | */ | |
736 | U_CFUNC UBool | |
737 | ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, | |
738 | UChar32 cp, | |
739 | const UChar **src, const UChar *srcLimit, | |
740 | char **target, const char *targetLimit, | |
741 | int32_t **offsets, int32_t srcIndex, | |
742 | UBool flush, | |
743 | UErrorCode *pErrorCode) { | |
744 | uint32_t value; | |
745 | int32_t match; | |
746 | ||
747 | /* try to match */ | |
748 | match=ucnv_extMatchFromU(cx, cp, | |
749 | NULL, 0, | |
750 | *src, (int32_t)(srcLimit-*src), | |
751 | &value, | |
752 | cnv->useFallback, flush); | |
753 | ||
754 | /* reject a match if the result is a single byte for DBCS-only */ | |
755 | if( match>=2 && | |
756 | !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 && | |
757 | cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) | |
758 | ) { | |
759 | /* advance src pointer for the consumed input */ | |
760 | *src+=match-2; /* remove 2 for the initial code point */ | |
761 | ||
762 | /* write result to target */ | |
763 | ucnv_extWriteFromU(cnv, cx, | |
764 | value, | |
765 | target, targetLimit, | |
766 | offsets, srcIndex, | |
767 | pErrorCode); | |
768 | return TRUE; | |
769 | } else if(match<0) { | |
770 | /* save state for partial match */ | |
771 | const UChar *s; | |
772 | int32_t j; | |
773 | ||
774 | /* copy the first code point */ | |
775 | cnv->preFromUFirstCP=cp; | |
776 | ||
777 | /* now copy the newly consumed input */ | |
778 | s=*src; | |
779 | match=-match-2; /* remove 2 for the initial code point */ | |
780 | for(j=0; j<match; ++j) { | |
781 | cnv->preFromU[j]=*s++; | |
782 | } | |
783 | *src=s; /* same as *src=srcLimit; because we reached the end of input */ | |
784 | cnv->preFromULength=(int8_t)match; | |
785 | return TRUE; | |
786 | } else if(match==1) { | |
787 | /* matched, no mapping but request for <subchar1> */ | |
788 | cnv->useSubChar1=TRUE; | |
789 | return FALSE; | |
790 | } else /* match==0 no match */ { | |
791 | return FALSE; | |
792 | } | |
793 | } | |
794 | ||
795 | U_CFUNC int32_t | |
796 | ucnv_extSimpleMatchFromU(const int32_t *cx, | |
797 | UChar32 cp, uint32_t *pValue, | |
798 | UBool useFallback) { | |
799 | uint32_t value; | |
800 | int32_t match; | |
801 | ||
802 | /* try to match */ | |
803 | match=ucnv_extMatchFromU(cx, | |
804 | cp, | |
805 | NULL, 0, | |
806 | NULL, 0, | |
807 | &value, | |
808 | useFallback, TRUE); | |
809 | if(match>=2) { | |
810 | /* write result for simple, single-character conversion */ | |
811 | int32_t length; | |
812 | ||
813 | length=UCNV_EXT_FROM_U_GET_LENGTH(value); | |
814 | value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); | |
815 | ||
816 | if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { | |
817 | *pValue=value; | |
818 | return length; | |
819 | #if 0 /* not currently used */ | |
820 | } else if(length==4) { | |
821 | /* de-serialize a 4-byte result */ | |
822 | const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; | |
823 | *pValue= | |
824 | ((uint32_t)result[0]<<24)| | |
825 | ((uint32_t)result[1]<<16)| | |
826 | ((uint32_t)result[2]<<8)| | |
827 | result[3]; | |
828 | return 4; | |
829 | #endif | |
830 | } | |
831 | } | |
832 | ||
833 | /* | |
834 | * return no match because | |
835 | * - match>1 && resultLength>4: result too long for simple conversion | |
836 | * - match==1: no match found, <subchar1> preferred | |
837 | * - match==0: no match found in the first place | |
838 | * - match<0: partial match, not supported for simple conversion (and flush==TRUE) | |
839 | */ | |
840 | return 0; | |
841 | } | |
842 | ||
843 | /* | |
844 | * continue partial match with new input, requires cnv->preFromUFirstCP>=0 | |
845 | * never called for simple, single-character conversion | |
846 | */ | |
847 | U_CFUNC void | |
848 | ucnv_extContinueMatchFromU(UConverter *cnv, | |
849 | UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, | |
850 | UErrorCode *pErrorCode) { | |
851 | uint32_t value; | |
852 | int32_t match; | |
853 | ||
854 | match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes, | |
855 | cnv->preFromUFirstCP, | |
856 | cnv->preFromU, cnv->preFromULength, | |
857 | pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), | |
858 | &value, | |
859 | cnv->useFallback, pArgs->flush); | |
860 | if(match>=2) { | |
861 | match-=2; /* remove 2 for the initial code point */ | |
862 | ||
863 | if(match>=cnv->preFromULength) { | |
864 | /* advance src pointer for the consumed input */ | |
865 | pArgs->source+=match-cnv->preFromULength; | |
866 | cnv->preFromULength=0; | |
867 | } else { | |
868 | /* the match did not use all of preFromU[] - keep the rest for replay */ | |
869 | int32_t length=cnv->preFromULength-match; | |
870 | uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR); | |
871 | cnv->preFromULength=(int8_t)-length; | |
872 | } | |
873 | ||
874 | /* finish the partial match */ | |
875 | cnv->preFromUFirstCP=U_SENTINEL; | |
876 | ||
877 | /* write result */ | |
878 | ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes, | |
879 | value, | |
880 | &pArgs->target, pArgs->targetLimit, | |
881 | &pArgs->offsets, srcIndex, | |
882 | pErrorCode); | |
883 | } else if(match<0) { | |
884 | /* save state for partial match */ | |
885 | const UChar *s; | |
886 | int32_t j; | |
887 | ||
888 | /* just _append_ the newly consumed input to preFromU[] */ | |
889 | s=pArgs->source; | |
890 | match=-match-2; /* remove 2 for the initial code point */ | |
891 | for(j=cnv->preFromULength; j<match; ++j) { | |
892 | cnv->preFromU[j]=*s++; | |
893 | } | |
894 | pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ | |
895 | cnv->preFromULength=(int8_t)match; | |
896 | } else /* match==0 or 1 */ { | |
897 | /* | |
898 | * no match | |
899 | * | |
900 | * We need to split the previous input into two parts: | |
901 | * | |
902 | * 1. The first code point is unmappable - that's how we got into | |
903 | * trying the extension data in the first place. | |
904 | * We need to move it from the preFromU buffer | |
905 | * to the error buffer, set an error code, | |
906 | * and prepare the rest of the previous input for 2. | |
907 | * | |
908 | * 2. The rest of the previous input must be converted once we | |
909 | * come back from the callback for the first code point. | |
910 | * At that time, we have to try again from scratch to convert | |
911 | * these input characters. | |
912 | * The replay will be handled by the ucnv.c conversion code. | |
913 | */ | |
914 | ||
915 | if(match==1) { | |
916 | /* matched, no mapping but request for <subchar1> */ | |
917 | cnv->useSubChar1=TRUE; | |
918 | } | |
919 | ||
920 | /* move the first code point to the error field */ | |
921 | cnv->fromUChar32=cnv->preFromUFirstCP; | |
922 | cnv->preFromUFirstCP=U_SENTINEL; | |
923 | ||
924 | /* mark preFromU for replay */ | |
925 | cnv->preFromULength=-cnv->preFromULength; | |
926 | ||
927 | /* set the error code for unassigned */ | |
928 | *pErrorCode=U_INVALID_CHAR_FOUND; | |
929 | } | |
930 | } | |
931 | ||
932 | static void | |
933 | ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, | |
934 | const int32_t *cx, | |
73c04bcf | 935 | const USetAdder *sa, |
374ca955 A |
936 | UConverterUnicodeSet which, |
937 | int32_t minLength, | |
938 | UChar32 c, | |
939 | UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, | |
940 | int32_t sectionIndex, | |
941 | UErrorCode *pErrorCode) { | |
942 | const UChar *fromUSectionUChars; | |
943 | const uint32_t *fromUSectionValues; | |
944 | ||
945 | uint32_t value; | |
946 | int32_t i, count; | |
947 | ||
948 | fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex; | |
949 | fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex; | |
950 | ||
951 | /* read first pair of the section */ | |
952 | count=*fromUSectionUChars++; | |
953 | value=*fromUSectionValues++; | |
954 | ||
955 | if( value!=0 && | |
956 | UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) && | |
957 | UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength | |
958 | ) { | |
959 | if(c>=0) { | |
960 | /* add the initial code point */ | |
961 | sa->add(sa->set, c); | |
962 | } else { | |
963 | /* add the string so far */ | |
964 | sa->addString(sa->set, s, length); | |
965 | } | |
966 | } | |
967 | ||
968 | for(i=0; i<count; ++i) { | |
969 | /* append this code unit and recurse or add the string */ | |
970 | s[length]=fromUSectionUChars[i]; | |
971 | value=fromUSectionValues[i]; | |
972 | ||
973 | if(value==0) { | |
974 | /* no mapping, do nothing */ | |
975 | } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { | |
976 | ucnv_extGetUnicodeSetString( | |
977 | sharedData, cx, sa, which, minLength, | |
978 | U_SENTINEL, s, length+1, | |
979 | (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), | |
980 | pErrorCode); | |
981 | } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== | |
982 | UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && | |
983 | UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength | |
984 | ) { | |
985 | sa->addString(sa->set, s, length+1); | |
986 | } | |
987 | } | |
988 | } | |
989 | ||
990 | U_CFUNC void | |
991 | ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, | |
73c04bcf | 992 | const USetAdder *sa, |
374ca955 A |
993 | UConverterUnicodeSet which, |
994 | UErrorCode *pErrorCode) { | |
995 | const int32_t *cx; | |
996 | const uint16_t *stage12, *stage3, *ps2, *ps3; | |
997 | const uint32_t *stage3b; | |
998 | ||
999 | uint32_t value; | |
1000 | int32_t st1, stage1Length, st2, st3, minLength; | |
1001 | ||
1002 | UChar s[UCNV_EXT_MAX_UCHARS]; | |
1003 | UChar32 c; | |
1004 | int32_t length; | |
1005 | ||
1006 | cx=sharedData->mbcs.extIndexes; | |
1007 | if(cx==NULL) { | |
1008 | return; | |
1009 | } | |
1010 | ||
1011 | stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); | |
1012 | stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); | |
1013 | stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); | |
1014 | ||
1015 | stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; | |
1016 | ||
1017 | /* enumerate the from-Unicode trie table */ | |
1018 | c=0; /* keep track of the current code point while enumerating */ | |
1019 | ||
1020 | if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) { | |
1021 | /* DBCS-only, ignore single-byte results */ | |
1022 | minLength=2; | |
1023 | } else { | |
1024 | minLength=1; | |
1025 | } | |
1026 | ||
1027 | /* | |
1028 | * the trie enumeration is almost the same as | |
1029 | * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1 | |
1030 | */ | |
1031 | for(st1=0; st1<stage1Length; ++st1) { | |
1032 | st2=stage12[st1]; | |
1033 | if(st2>stage1Length) { | |
1034 | ps2=stage12+st2; | |
1035 | for(st2=0; st2<64; ++st2) { | |
1036 | if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) { | |
1037 | /* read the stage 3 block */ | |
1038 | ps3=stage3+st3; | |
1039 | ||
1040 | /* | |
1041 | * Add code points for which the roundtrip flag is set. | |
1042 | * Do not add <subchar1> entries or other (future?) pseudo-entries | |
1043 | * with an output length of 0, or entries with reserved bits set. | |
1044 | * Recurse for partial results. | |
1045 | */ | |
1046 | do { | |
1047 | value=stage3b[*ps3++]; | |
1048 | if(value==0) { | |
1049 | /* no mapping, do nothing */ | |
1050 | } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { | |
1051 | length=0; | |
1052 | U16_APPEND_UNSAFE(s, length, c); | |
1053 | ucnv_extGetUnicodeSetString( | |
1054 | sharedData, cx, sa, which, minLength, | |
1055 | c, s, length, | |
1056 | (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), | |
1057 | pErrorCode); | |
1058 | } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== | |
1059 | UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && | |
1060 | UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength | |
1061 | ) { | |
1062 | sa->add(sa->set, c); | |
1063 | } | |
1064 | } while((++c&0xf)!=0); | |
1065 | } else { | |
1066 | c+=16; /* empty stage 3 block */ | |
1067 | } | |
1068 | } | |
1069 | } else { | |
1070 | c+=1024; /* empty stage 2 block */ | |
1071 | } | |
1072 | } | |
1073 | } | |
1074 | ||
1075 | #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |