]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
4 | * Copyright (C) 2000-2001, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ****************************************************************************** | |
8 | * file name: ubidiwrt.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 1999aug06 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * This file contains implementations for BiDi functions that use | |
17 | * the core algorithm and core API to write reordered text. | |
18 | */ | |
19 | ||
20 | /* set import/export definitions */ | |
21 | #ifndef U_COMMON_IMPLEMENTATION | |
22 | # define U_COMMON_IMPLEMENTATION | |
23 | #endif | |
24 | ||
25 | #include "unicode/utypes.h" | |
26 | #include "unicode/ustring.h" | |
27 | #include "unicode/uchar.h" | |
28 | #include "unicode/ubidi.h" | |
29 | #include "cmemory.h" | |
30 | #include "ustr_imp.h" | |
31 | #include "ubidiimp.h" | |
32 | ||
33 | /* | |
34 | * The function implementations in this file are designed | |
35 | * for UTF-16 and UTF-32, not for UTF-8. | |
36 | * | |
37 | * Assumptions that are not true for UTF-8: | |
38 | * - Any code point always needs the same number of code units | |
39 | * ("minimum-length-problem" of UTF-8) | |
40 | * - The BiDi control characters need only one code unit each | |
41 | * | |
42 | * Further assumptions for all UTFs: | |
43 | * - u_charMirror(c) needs the same number of code units as c | |
44 | */ | |
45 | #if UTF_SIZE==8 | |
46 | # error reimplement ubidi_writeReordered() for UTF-8, see comment above | |
47 | #endif | |
48 | ||
/**
 * Code points of the BiDi control characters handled in this file.
 * The two marks are adjacent, and the five embedding/override controls
 * form one contiguous range; the macros below rely on both facts.
 */
enum {
    LRM_CHAR=0x200e,    /* left-to-right mark */
    RLM_CHAR=0x200f,    /* right-to-left mark */
    LRE_CHAR=0x202a,    /* left-to-right embedding */
    RLE_CHAR=0x202b,    /* right-to-left embedding */
    PDF_CHAR=0x202c,    /* pop directional format */
    LRO_CHAR=0x202d,    /* left-to-right override */
    RLO_CHAR=0x202e     /* right-to-left override */
};

/*
 * True for LRM/RLM (they differ only in the lowest bit) and for the
 * five code points LRE..RLO (one unsigned range comparison).
 */
#define IS_BIDI_CONTROL_CHAR(c) \
    (((uint32_t)(c)&0xfffffffe)==LRM_CHAR || \
     (uint32_t)((c)-LRE_CHAR)<5)

/*
 * Non-zero if the general category value "type" is one of the three
 * mark categories tested here (non-spacing, combining spacing, enclosing).
 */
#define IS_COMBINING(type) \
    ((1UL<<(type))& \
     (1UL<<U_NON_SPACING_MARK| \
      1UL<<U_COMBINING_SPACING_MARK| \
      1UL<<U_ENCLOSING_MARK))
62 | ||
63 | /* | |
64 | * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we | |
65 | * semantically write RTL runs in reverse and later reverse them again. | |
66 | * Instead, we actually write them in forward order to begin with. | |
67 | * However, if the RTL run was to be mirrored, we need to mirror here now | |
68 | * since the implicit second reversal must not do it. | |
69 | * It looks strange to do mirroring in LTR output, but it is only because | |
70 | * we are writing RTL output in reverse. | |
71 | */ | |
72 | static int32_t | |
73 | doWriteForward(const UChar *src, int32_t srcLength, | |
74 | UChar *dest, int32_t destSize, | |
75 | uint16_t options, | |
76 | UErrorCode *pErrorCode) { | |
77 | /* optimize for several combinations of options */ | |
78 | switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) { | |
79 | case 0: { | |
80 | /* simply copy the LTR run to the destination */ | |
81 | int32_t length=srcLength; | |
82 | if(destSize<length) { | |
83 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
84 | return srcLength; | |
85 | } | |
86 | do { | |
87 | *dest++=*src++; | |
88 | } while(--length>0); | |
89 | return srcLength; | |
90 | } | |
91 | case UBIDI_DO_MIRRORING: { | |
92 | /* do mirroring */ | |
93 | int32_t i=0, j=0; | |
94 | UChar32 c; | |
95 | ||
96 | if(destSize<srcLength) { | |
97 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
98 | return srcLength; | |
99 | } | |
100 | do { | |
101 | UTF_NEXT_CHAR(src, i, srcLength, c); | |
102 | c=u_charMirror(c); | |
103 | UTF_APPEND_CHAR_UNSAFE(dest, j, c); | |
104 | } while(i<srcLength); | |
105 | return srcLength; | |
106 | } | |
107 | case UBIDI_REMOVE_BIDI_CONTROLS: { | |
108 | /* copy the LTR run and remove any BiDi control characters */ | |
109 | int32_t remaining=destSize; | |
110 | UChar c; | |
111 | do { | |
112 | c=*src++; | |
113 | if(!IS_BIDI_CONTROL_CHAR(c)) { | |
114 | if(--remaining<0) { | |
115 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
116 | ||
117 | /* preflight the length */ | |
118 | while(--srcLength>0) { | |
119 | c=*src++; | |
120 | if(!IS_BIDI_CONTROL_CHAR(c)) { | |
121 | --remaining; | |
122 | } | |
123 | } | |
124 | return destSize-remaining; | |
125 | } | |
126 | *dest++=c; | |
127 | } | |
128 | } while(--srcLength>0); | |
129 | return destSize-remaining; | |
130 | } | |
131 | default: { | |
132 | /* remove BiDi control characters and do mirroring */ | |
133 | int32_t remaining=destSize; | |
134 | int32_t i, j=0; | |
135 | UChar32 c; | |
136 | do { | |
137 | i=0; | |
138 | UTF_NEXT_CHAR(src, i, srcLength, c); | |
139 | src+=i; | |
140 | srcLength-=i; | |
141 | if(!IS_BIDI_CONTROL_CHAR(c)) { | |
142 | remaining-=i; | |
143 | if(remaining<0) { | |
144 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
145 | ||
146 | /* preflight the length */ | |
147 | while(srcLength>0) { | |
148 | c=*src++; | |
149 | if(!IS_BIDI_CONTROL_CHAR(c)) { | |
150 | --remaining; | |
151 | } | |
152 | --srcLength; | |
153 | } | |
154 | return destSize-remaining; | |
155 | } | |
156 | c=u_charMirror(c); | |
157 | UTF_APPEND_CHAR_UNSAFE(dest, j, c); | |
158 | } | |
159 | } while(srcLength>0); | |
160 | return j; | |
161 | } | |
162 | } /* end of switch */ | |
163 | } | |
164 | ||
165 | static int32_t | |
166 | doWriteReverse(const UChar *src, int32_t srcLength, | |
167 | UChar *dest, int32_t destSize, | |
168 | uint16_t options, | |
169 | UErrorCode *pErrorCode) { | |
170 | /* | |
171 | * RTL run - | |
172 | * | |
173 | * RTL runs need to be copied to the destination in reverse order | |
174 | * of code points, not code units, to keep Unicode characters intact. | |
175 | * | |
176 | * The general strategy for this is to read the source text | |
177 | * in backward order, collect all code units for a code point | |
178 | * (and optionally following combining characters, see below), | |
179 | * and copy all these code units in ascending order | |
180 | * to the destination for this run. | |
181 | * | |
182 | * Several options request whether combining characters | |
183 | * should be kept after their base characters, | |
184 | * whether BiDi control characters should be removed, and | |
185 | * whether characters should be replaced by their mirror-image | |
186 | * equivalent Unicode characters. | |
187 | */ | |
188 | int32_t i, j; | |
189 | UChar32 c; | |
190 | ||
191 | /* optimize for several combinations of options */ | |
192 | switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) { | |
193 | case 0: | |
194 | /* | |
195 | * With none of the "complicated" options set, the destination | |
196 | * run will have the same length as the source run, | |
197 | * and there is no mirroring and no keeping combining characters | |
198 | * with their base characters. | |
199 | */ | |
200 | if(destSize<srcLength) { | |
201 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
202 | return srcLength; | |
203 | } | |
204 | destSize=srcLength; | |
205 | ||
206 | /* preserve character integrity */ | |
207 | do { | |
208 | /* i is always after the last code unit known to need to be kept in this segment */ | |
209 | i=srcLength; | |
210 | ||
211 | /* collect code units for one base character */ | |
212 | UTF_BACK_1(src, 0, srcLength); | |
213 | ||
214 | /* copy this base character */ | |
215 | j=srcLength; | |
216 | do { | |
217 | *dest++=src[j++]; | |
218 | } while(j<i); | |
219 | } while(srcLength>0); | |
220 | break; | |
221 | case UBIDI_KEEP_BASE_COMBINING: | |
222 | /* | |
223 | * Here, too, the destination | |
224 | * run will have the same length as the source run, | |
225 | * and there is no mirroring. | |
226 | * We do need to keep combining characters with their base characters. | |
227 | */ | |
228 | if(destSize<srcLength) { | |
229 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
230 | return srcLength; | |
231 | } | |
232 | destSize=srcLength; | |
233 | ||
234 | /* preserve character integrity */ | |
235 | do { | |
236 | /* i is always after the last code unit known to need to be kept in this segment */ | |
237 | i=srcLength; | |
238 | ||
239 | /* collect code units and modifier letters for one base character */ | |
240 | do { | |
241 | UTF_PREV_CHAR(src, 0, srcLength, c); | |
242 | } while(srcLength>0 && IS_COMBINING(u_charType(c))); | |
243 | ||
244 | /* copy this "user character" */ | |
245 | j=srcLength; | |
246 | do { | |
247 | *dest++=src[j++]; | |
248 | } while(j<i); | |
249 | } while(srcLength>0); | |
250 | break; | |
251 | default: | |
252 | /* | |
253 | * With several "complicated" options set, this is the most | |
254 | * general and the slowest copying of an RTL run. | |
255 | * We will do mirroring, remove BiDi controls, and | |
256 | * keep combining characters with their base characters | |
257 | * as requested. | |
258 | */ | |
259 | if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) { | |
260 | i=srcLength; | |
261 | } else { | |
262 | /* we need to find out the destination length of the run, | |
263 | which will not include the BiDi control characters */ | |
264 | int32_t length=srcLength; | |
265 | UChar ch; | |
266 | ||
267 | i=0; | |
268 | do { | |
269 | ch=*src++; | |
270 | if(!IS_BIDI_CONTROL_CHAR(ch)) { | |
271 | ++i; | |
272 | } | |
273 | } while(--length>0); | |
274 | src-=srcLength; | |
275 | } | |
276 | ||
277 | if(destSize<i) { | |
278 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
279 | return i; | |
280 | } | |
281 | destSize=i; | |
282 | ||
283 | /* preserve character integrity */ | |
284 | do { | |
285 | /* i is always after the last code unit known to need to be kept in this segment */ | |
286 | i=srcLength; | |
287 | ||
288 | /* collect code units for one base character */ | |
289 | UTF_PREV_CHAR(src, 0, srcLength, c); | |
290 | if(options&UBIDI_KEEP_BASE_COMBINING) { | |
291 | /* collect modifier letters for this base character */ | |
292 | while(srcLength>0 && IS_COMBINING(u_charType(c))) { | |
293 | UTF_PREV_CHAR(src, 0, srcLength, c); | |
294 | } | |
295 | } | |
296 | ||
297 | if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) { | |
298 | /* do not copy this BiDi control character */ | |
299 | continue; | |
300 | } | |
301 | ||
302 | /* copy this "user character" */ | |
303 | j=srcLength; | |
304 | if(options&UBIDI_DO_MIRRORING) { | |
305 | /* mirror only the base character */ | |
306 | int32_t k=0; | |
307 | c=u_charMirror(c); | |
308 | UTF_APPEND_CHAR_UNSAFE(dest, k, c); | |
309 | dest+=k; | |
310 | j+=k; | |
311 | } | |
312 | while(j<i) { | |
313 | *dest++=src[j++]; | |
314 | } | |
315 | } while(srcLength>0); | |
316 | break; | |
317 | } /* end of switch */ | |
318 | ||
319 | return destSize; | |
320 | } | |
321 | ||
322 | U_CAPI int32_t U_EXPORT2 | |
323 | ubidi_writeReverse(const UChar *src, int32_t srcLength, | |
324 | UChar *dest, int32_t destSize, | |
325 | uint16_t options, | |
326 | UErrorCode *pErrorCode) { | |
327 | int32_t destLength; | |
328 | ||
329 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
330 | return 0; | |
331 | } | |
332 | ||
333 | /* more error checking */ | |
334 | if( src==NULL || srcLength<-1 || | |
335 | destSize<0 || (destSize>0 && dest==NULL)) | |
336 | { | |
337 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
338 | return 0; | |
339 | } | |
340 | ||
341 | /* do input and output overlap? */ | |
342 | if( dest!=NULL && | |
343 | ((src>=dest && src<dest+destSize) || | |
344 | (dest>=src && dest<src+srcLength))) | |
345 | { | |
346 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
347 | return 0; | |
348 | } | |
349 | ||
350 | if(srcLength==-1) { | |
351 | srcLength=u_strlen(src); | |
352 | } | |
353 | if(srcLength>0) { | |
354 | destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode); | |
355 | } else { | |
356 | /* nothing to do */ | |
357 | destLength=0; | |
358 | } | |
359 | ||
360 | return u_terminateUChars(dest, destSize, destLength, pErrorCode); | |
361 | } | |
362 | ||
363 | #define MASK_R_AL (1UL<<U_RIGHT_TO_LEFT|1UL<<U_RIGHT_TO_LEFT_ARABIC) | |
364 | ||
365 | U_CAPI int32_t U_EXPORT2 | |
366 | ubidi_writeReordered(UBiDi *pBiDi, | |
367 | UChar *dest, int32_t destSize, | |
368 | uint16_t options, | |
369 | UErrorCode *pErrorCode) { | |
370 | const UChar *text; | |
371 | UChar *saveDest; | |
372 | int32_t length, destCapacity; | |
373 | int32_t run, runCount, logicalStart, runLength; | |
374 | ||
375 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
376 | return 0; | |
377 | } | |
378 | ||
379 | /* more error checking */ | |
380 | if( pBiDi==NULL || | |
381 | (text=ubidi_getText(pBiDi))==NULL || (length=ubidi_getLength(pBiDi))<0 || | |
382 | destSize<0 || (destSize>0 && dest==NULL)) | |
383 | { | |
384 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
385 | return 0; | |
386 | } | |
387 | ||
388 | /* do input and output overlap? */ | |
389 | if( dest!=NULL && | |
390 | ((text>=dest && text<dest+destSize) || | |
391 | (dest>=text && dest<text+length))) | |
392 | { | |
393 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
394 | return 0; | |
395 | } | |
396 | ||
397 | if(length==0) { | |
398 | /* nothing to do */ | |
399 | return u_terminateUChars(dest, destSize, 0, pErrorCode); | |
400 | } | |
401 | ||
402 | runCount=ubidi_countRuns(pBiDi, pErrorCode); | |
403 | if(U_FAILURE(*pErrorCode)) { | |
404 | return 0; | |
405 | } | |
406 | ||
407 | /* destSize shrinks, later destination length=destCapacity-destSize */ | |
408 | saveDest=dest; | |
409 | destCapacity=destSize; | |
410 | ||
411 | /* | |
412 | * If we do not perform the "inverse BiDi" algorithm, then we | |
413 | * don't need to insert any LRMs, and don't need to test for it. | |
414 | */ | |
415 | if(!ubidi_isInverse(pBiDi)) { | |
416 | options&=~UBIDI_INSERT_LRM_FOR_NUMERIC; | |
417 | } | |
418 | ||
419 | /* | |
420 | * Iterate through all visual runs and copy the run text segments to | |
421 | * the destination, according to the options. | |
422 | * | |
423 | * The tests for where to insert LRMs ignore the fact that there may be | |
424 | * BN codes or non-BMP code points at the beginning and end of a run; | |
425 | * they may insert LRMs unnecessarily but the tests are faster this way | |
426 | * (this would have to be improved for UTF-8). | |
427 | * | |
428 | * Note that the only errors that are set by doWriteXY() are buffer overflow | |
429 | * errors. Ignore them until the end, and continue for preflighting. | |
430 | */ | |
431 | if(!(options&UBIDI_OUTPUT_REVERSE)) { | |
432 | /* forward output */ | |
433 | if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) { | |
434 | /* do not insert BiDi controls */ | |
435 | for(run=0; run<runCount; ++run) { | |
436 | if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) { | |
437 | runLength=doWriteForward(text+logicalStart, runLength, | |
438 | dest, destSize, | |
439 | (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); | |
440 | } else { | |
441 | runLength=doWriteReverse(text+logicalStart, runLength, | |
442 | dest, destSize, | |
443 | options, pErrorCode); | |
444 | } | |
445 | dest+=runLength; | |
446 | destSize-=runLength; | |
447 | } | |
448 | } else { | |
449 | /* insert BiDi controls for "inverse BiDi" */ | |
450 | const UChar *src; | |
451 | UBiDiDirection dir; | |
452 | ||
453 | for(run=0; run<runCount; ++run) { | |
454 | dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength); | |
455 | src=text+logicalStart; | |
456 | ||
457 | if(UBIDI_LTR==dir) { | |
458 | if(/*run>0 &&*/ u_charDirection(*src)!=U_LEFT_TO_RIGHT) { | |
459 | if(destSize>0) { | |
460 | *dest++=LRM_CHAR; | |
461 | } | |
462 | --destSize; | |
463 | } | |
464 | ||
465 | runLength=doWriteForward(src, runLength, | |
466 | dest, destSize, | |
467 | (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); | |
468 | dest+=runLength; | |
469 | destSize-=runLength; | |
470 | ||
471 | if(/*run<runCount-1 &&*/ u_charDirection(src[runLength-1])!=U_LEFT_TO_RIGHT) { | |
472 | if(destSize>0) { | |
473 | *dest++=LRM_CHAR; | |
474 | } | |
475 | --destSize; | |
476 | } | |
477 | } else { | |
478 | if(/*run>0 &&*/ !(MASK_R_AL&1UL<<u_charDirection(src[runLength-1]))) { | |
479 | if(destSize>0) { | |
480 | *dest++=RLM_CHAR; | |
481 | } | |
482 | --destSize; | |
483 | } | |
484 | ||
485 | runLength=doWriteReverse(src, runLength, | |
486 | dest, destSize, | |
487 | options, pErrorCode); | |
488 | dest+=runLength; | |
489 | destSize-=runLength; | |
490 | ||
491 | if(/*run<runCount-1 &&*/ !(MASK_R_AL&1UL<<u_charDirection(*src))) { | |
492 | if(destSize>0) { | |
493 | *dest++=RLM_CHAR; | |
494 | } | |
495 | --destSize; | |
496 | } | |
497 | } | |
498 | } | |
499 | } | |
500 | } else { | |
501 | /* reverse output */ | |
502 | if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) { | |
503 | /* do not insert BiDi controls */ | |
504 | for(run=runCount; --run>=0;) { | |
505 | if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) { | |
506 | runLength=doWriteReverse(text+logicalStart, runLength, | |
507 | dest, destSize, | |
508 | (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); | |
509 | } else { | |
510 | runLength=doWriteForward(text+logicalStart, runLength, | |
511 | dest, destSize, | |
512 | options, pErrorCode); | |
513 | } | |
514 | dest+=runLength; | |
515 | destSize-=runLength; | |
516 | } | |
517 | } else { | |
518 | /* insert BiDi controls for "inverse BiDi" */ | |
519 | const UChar *src; | |
520 | UBiDiDirection dir; | |
521 | ||
522 | for(run=runCount; --run>=0;) { | |
523 | /* reverse output */ | |
524 | dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength); | |
525 | src=text+logicalStart; | |
526 | ||
527 | if(UBIDI_LTR==dir) { | |
528 | if(/*run<runCount-1 &&*/ u_charDirection(src[runLength-1])!=U_LEFT_TO_RIGHT) { | |
529 | if(destSize>0) { | |
530 | *dest++=LRM_CHAR; | |
531 | } | |
532 | --destSize; | |
533 | } | |
534 | ||
535 | runLength=doWriteReverse(src, runLength, | |
536 | dest, destSize, | |
537 | (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); | |
538 | dest+=runLength; | |
539 | destSize-=runLength; | |
540 | ||
541 | if(/*run>0 &&*/ u_charDirection(*src)!=U_LEFT_TO_RIGHT) { | |
542 | if(destSize>0) { | |
543 | *dest++=LRM_CHAR; | |
544 | } | |
545 | --destSize; | |
546 | } | |
547 | } else { | |
548 | if(/*run<runCount-1 &&*/ !(MASK_R_AL&1UL<<u_charDirection(*src))) { | |
549 | if(destSize>0) { | |
550 | *dest++=RLM_CHAR; | |
551 | } | |
552 | --destSize; | |
553 | } | |
554 | ||
555 | runLength=doWriteForward(src, runLength, | |
556 | dest, destSize, | |
557 | options, pErrorCode); | |
558 | dest+=runLength; | |
559 | destSize-=runLength; | |
560 | ||
561 | if(/*run>0 &&*/ !(MASK_R_AL&1UL<<u_charDirection(src[runLength-1]))) { | |
562 | if(destSize>0) { | |
563 | *dest++=RLM_CHAR; | |
564 | } | |
565 | --destSize; | |
566 | } | |
567 | } | |
568 | } | |
569 | } | |
570 | } | |
571 | ||
572 | return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode); | |
573 | } |