2 ******************************************************************************
4 * Copyright (C) 2000-2001, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ubidiwrt.c
10 * tab size: 8 (not used)
13 * created on: 1999aug06
14 * created by: Markus W. Scherer
16 * This file contains implementations for BiDi functions that use
17 * the core algorithm and core API to write reordered text.
20 /* set import/export definitions */
21 #ifndef U_COMMON_IMPLEMENTATION
22 # define U_COMMON_IMPLEMENTATION
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ubidi.h"
34 * The function implementations in this file are designed
35 * for UTF-16 and UTF-32, not for UTF-8.
37 * Assumptions that are not true for UTF-8:
38 * - Any code point always needs the same number of code units
39 * ("minimum-length-problem" of UTF-8)
40 * - The BiDi control characters need only one code unit each
42 * Further assumptions for all UTFs:
43 * - u_charMirror(c) needs the same number of code units as c
46 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
49 /** BiDi control code points */
/*
 * IS_BIDI_CONTROL_CHAR(c): nonzero when c, with its lowest bit cleared,
 * equals LRM_CHAR, or when c lies in the range of five consecutive code
 * points starting at LRE_CHAR (the unsigned cast turns values below
 * LRE_CHAR into large numbers, so one comparison covers both bounds).
 * NOTE(review): LRM_CHAR and LRE_CHAR are defined outside this listing;
 * the first test presumably pairs LRM with RLM - confirm their values.
 */
60 #define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffe)==LRM_CHAR || (uint32_t)((c)-LRE_CHAR)<5)
/*
 * IS_COMBINING(type): nonzero when the bit numbered type is set in a mask
 * built from U_NON_SPACING_MARK, U_COMBINING_SPACING_MARK and
 * U_ENCLOSING_MARK. The callers in this file pass u_charType(c).
 */
61 #define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK))
64 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
65 * semantically write RTL runs in reverse and later reverse them again.
66 * Instead, we actually write them in forward order to begin with.
67 * However, if the RTL run was to be mirrored, we need to mirror here now
68 * since the implicit second reversal must not do it.
69 * It looks strange to do mirroring in LTR output, but it is only because
70 * we are writing RTL output in reverse.
/*
 * doWriteForward: copy one run of text from src to dest without reversing
 * it, optionally mirroring characters and/or dropping BiDi controls.
 *
 * The switch below picks a specialized loop from the option bits
 * UBIDI_REMOVE_BIDI_CONTROLS and UBIDI_DO_MIRRORING. When the destination
 * is too small, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR; the
 * branches that remove BiDi controls then go on scanning the source
 * (preflighting) and return destSize-remaining.
 *
 * NOTE(review): this listing is missing lines (the return type, the
 * options parameter, case labels and several statements) - confirm every
 * detail against the complete file.
 */
73 doWriteForward(const UChar
*src
, int32_t srcLength
,
74 UChar
*dest
, int32_t destSize
,
76 UErrorCode
*pErrorCode
) {
77 /* optimize for several combinations of options */
78 switch(options
&(UBIDI_REMOVE_BIDI_CONTROLS
|UBIDI_DO_MIRRORING
)) {
80 /* simply copy the LTR run to the destination */
81 int32_t length
=srcLength
;
83 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
91 case UBIDI_DO_MIRRORING
: {
/* mirroring only: the size check compares destSize with srcLength, which
   relies on the file-level assumption that a mirrored character needs
   the same number of code units as the original */
96 if(destSize
<srcLength
) {
97 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* i is the read index into src and j the write index into dest;
   presumably c is replaced by its mirror image between the two macros -
   that statement is not visible in this listing */
101 UTF_NEXT_CHAR(src
, i
, srcLength
, c
);
103 UTF_APPEND_CHAR_UNSAFE(dest
, j
, c
);
104 } while(i
<srcLength
);
107 case UBIDI_REMOVE_BIDI_CONTROLS
: {
108 /* copy the LTR run and remove any BiDi control characters */
/* remaining starts at the destination capacity, so destSize-remaining
   is the value returned as the output length */
109 int32_t remaining
=destSize
;
113 if(!IS_BIDI_CONTROL_CHAR(c
)) {
115 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
117 /* preflight the length */
/* overflow has already been reported here; this inner loop walks the rest
   of the run only to finish the length computation */
118 while(--srcLength
>0) {
120 if(!IS_BIDI_CONTROL_CHAR(c
)) {
124 return destSize
-remaining
;
128 } while(--srcLength
>0);
129 return destSize
-remaining
;
132 /* remove BiDi control characters and do mirroring */
133 int32_t remaining
=destSize
;
/* this branch reads whole code points, unlike the branch above which
   tests c without a UTF_NEXT_CHAR call in the visible lines */
138 UTF_NEXT_CHAR(src
, i
, srcLength
, c
);
141 if(!IS_BIDI_CONTROL_CHAR(c
)) {
144 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
146 /* preflight the length */
149 if(!IS_BIDI_CONTROL_CHAR(c
)) {
154 return destSize
-remaining
;
157 UTF_APPEND_CHAR_UNSAFE(dest
, j
, c
);
159 } while(srcLength
>0);
162 } /* end of switch */
/*
 * doWriteReverse: copy one run of text from src to dest in reverse order
 * of code points (the strategy is described in the comment just below).
 *
 * The switch picks a loop from the option bits
 * UBIDI_REMOVE_BIDI_CONTROLS, UBIDI_DO_MIRRORING and
 * UBIDI_KEEP_BASE_COMBINING. Every visible branch reports a destination
 * that is too small by setting *pErrorCode to U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): this listing is missing lines (the return type, the
 * options parameter, case labels, the copy statements and the returns) -
 * confirm every detail against the complete file.
 */
166 doWriteReverse(const UChar
*src
, int32_t srcLength
,
167 UChar
*dest
, int32_t destSize
,
169 UErrorCode
*pErrorCode
) {
173 * RTL runs need to be copied to the destination in reverse order
174 * of code points, not code units, to keep Unicode characters intact.
176 * The general strategy for this is to read the source text
177 * in backward order, collect all code units for a code point
178 * (and optionally following combining characters, see below),
179 * and copy all these code units in ascending order
180 * to the destination for this run.
182 * Several options request whether combining characters
183 * should be kept after their base characters,
184 * whether BiDi control characters should be removed, and
185 * whether characters should be replaced by their mirror-image
186 * equivalent Unicode characters.
191 /* optimize for several combinations of options */
192 switch(options
&(UBIDI_REMOVE_BIDI_CONTROLS
|UBIDI_DO_MIRRORING
|UBIDI_KEEP_BASE_COMBINING
)) {
195 * With none of the "complicated" options set, the destination
196 * run will have the same length as the source run,
197 * and there is no mirroring and no keeping combining characters
198 * with their base characters.
200 if(destSize
<srcLength
) {
201 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
206 /* preserve character integrity */
208 /* i is always after the last code unit known to need to be kept in this segment */
211 /* collect code units for one base character */
/* srcLength doubles as the backward read position; presumably this macro
   moves it to the start of the preceding code point - confirm against
   the macro definition */
212 UTF_BACK_1(src
, 0, srcLength
);
214 /* copy this base character */
219 } while(srcLength
>0);
221 case UBIDI_KEEP_BASE_COMBINING
:
223 * Here, too, the destination
224 * run will have the same length as the source run,
225 * and there is no mirroring.
226 * We do need to keep combining characters with their base characters.
228 if(destSize
<srcLength
) {
229 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
234 /* preserve character integrity */
236 /* i is always after the last code unit known to need to be kept in this segment */
239 /* collect code units and modifier letters for one base character */
/* keep reading backward while the code point just read is a combining
   mark, so that the marks are copied together with their base */
241 UTF_PREV_CHAR(src
, 0, srcLength
, c
);
242 } while(srcLength
>0 && IS_COMBINING(u_charType(c
)));
244 /* copy this "user character" */
249 } while(srcLength
>0);
253 * With several "complicated" options set, this is the most
254 * general and the slowest copying of an RTL run.
255 * We will do mirroring, remove BiDi controls, and
256 * keep combining characters with their base characters
/* the output length has to be counted separately only when BiDi controls
   are removed; the statement for the other case is not visible here */
259 if(!(options
&UBIDI_REMOVE_BIDI_CONTROLS
)) {
262 /* we need to find out the destination length of the run,
263 which will not include the BiDi control characters */
264 int32_t length
=srcLength
;
270 if(!IS_BIDI_CONTROL_CHAR(ch
)) {
278 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
283 /* preserve character integrity */
285 /* i is always after the last code unit known to need to be kept in this segment */
288 /* collect code units for one base character */
289 UTF_PREV_CHAR(src
, 0, srcLength
, c
);
290 if(options
&UBIDI_KEEP_BASE_COMBINING
) {
291 /* collect modifier letters for this base character */
292 while(srcLength
>0 && IS_COMBINING(u_charType(c
))) {
293 UTF_PREV_CHAR(src
, 0, srcLength
, c
);
/* at this point c is the last code point read backward; the control test
   below is applied to that code point */
297 if(options
&UBIDI_REMOVE_BIDI_CONTROLS
&& IS_BIDI_CONTROL_CHAR(c
)) {
298 /* do not copy this BiDi control character */
302 /* copy this "user character" */
304 if(options
&UBIDI_DO_MIRRORING
) {
305 /* mirror only the base character */
308 UTF_APPEND_CHAR_UNSAFE(dest
, k
, c
);
315 } while(srcLength
>0);
317 } /* end of switch */
/*
 * ubidi_writeReverse: exported entry point that validates its arguments
 * and then writes src into dest through doWriteReverse().
 *
 * Checks, in order of appearance:
 *  - pErrorCode is NULL or already holds a failure code;
 *  - src is NULL, srcLength is below -1, destSize is negative, or dest is
 *    NULL while destSize is positive: U_ILLEGAL_ARGUMENT_ERROR;
 *  - the src and dest ranges overlap: U_ILLEGAL_ARGUMENT_ERROR.
 * The length returned by doWriteReverse() is handed to
 * u_terminateUChars() together with dest and destSize, and that result
 * is returned.
 *
 * NOTE(review): this listing is missing lines (the options parameter, the
 * early returns and the conditions around u_strlen and doWriteReverse) -
 * confirm against the complete file.
 */
322 U_CAPI
int32_t U_EXPORT2
323 ubidi_writeReverse(const UChar
*src
, int32_t srcLength
,
324 UChar
*dest
, int32_t destSize
,
326 UErrorCode
*pErrorCode
) {
329 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
333 /* more error checking */
334 if( src
==NULL
|| srcLength
<-1 ||
335 destSize
<0 || (destSize
>0 && dest
==NULL
))
337 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
341 /* do input and output overlap? */
/* overlap means that either buffer starts inside the other one */
343 ((src
>=dest
&& src
<dest
+destSize
) ||
344 (dest
>=src
&& dest
<src
+srcLength
)))
346 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/* srcLength is recomputed from the NUL terminator here; presumably this
   runs only when the caller passed -1 - the guarding condition is not
   visible in this listing */
351 srcLength
=u_strlen(src
);
354 destLength
=doWriteReverse(src
, srcLength
, dest
, destSize
, options
, pErrorCode
);
360 return u_terminateUChars(dest
, destSize
, destLength
, pErrorCode
);
/*
 * MASK_R_AL: bit mask with the bits numbered U_RIGHT_TO_LEFT and
 * U_RIGHT_TO_LEFT_ARABIC set. The code below tests it as
 * MASK_R_AL&1UL<<u_charDirection(c) to see whether c has one of these two
 * directional classes.
 */
363 #define MASK_R_AL (1UL<<U_RIGHT_TO_LEFT|1UL<<U_RIGHT_TO_LEFT_ARABIC)
/*
 * ubidi_writeReordered: exported entry point that writes the text of
 * pBiDi to dest one visual run at a time, using doWriteForward() and
 * doWriteReverse() for the individual runs.
 *
 * Visible steps:
 *  - argument checks (NULL or failed pErrorCode, NULL text, negative
 *    length or destSize, NULL dest with positive destSize, overlapping
 *    buffers) that set U_ILLEGAL_ARGUMENT_ERROR;
 *  - runCount is taken from ubidi_countRuns();
 *  - UBIDI_INSERT_LRM_FOR_NUMERIC is cleared unless ubidi_isInverse()
 *    reports true for pBiDi;
 *  - without UBIDI_OUTPUT_REVERSE the runs are visited from 0 upward,
 *    with it from runCount-1 downward;
 *  - the call made for a UBIDI_LTR run always has UBIDI_DO_MIRRORING
 *    masked out of the options; the other call passes options unchanged.
 * destSize is decreased as output is produced, so the final length passed
 * to u_terminateUChars() is destCapacity-destSize.
 *
 * NOTE(review): this listing is missing lines (the options parameter,
 * else lines, the LRM/RLM insertion statements and the dest/destSize
 * updates) - confirm against the complete file.
 */
365 U_CAPI
int32_t U_EXPORT2
366 ubidi_writeReordered(UBiDi
*pBiDi
,
367 UChar
*dest
, int32_t destSize
,
369 UErrorCode
*pErrorCode
) {
372 int32_t length
, destCapacity
;
373 int32_t run
, runCount
, logicalStart
, runLength
;
375 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
379 /* more error checking */
/* text and length are fetched from pBiDi as part of this condition */
381 (text
=ubidi_getText(pBiDi
))==NULL
|| (length
=ubidi_getLength(pBiDi
))<0 ||
382 destSize
<0 || (destSize
>0 && dest
==NULL
))
384 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
388 /* do input and output overlap? */
390 ((text
>=dest
&& text
<dest
+destSize
) ||
391 (dest
>=text
&& dest
<text
+length
)))
393 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/* early exit that terminates an empty result; the condition guarding it
   is not visible in this listing (presumably length==0) */
399 return u_terminateUChars(dest
, destSize
, 0, pErrorCode
);
402 runCount
=ubidi_countRuns(pBiDi
, pErrorCode
);
403 if(U_FAILURE(*pErrorCode
)) {
407 /* destSize shrinks, later destination length=destCapacity-destSize */
409 destCapacity
=destSize
;
412 * If we do not perform the "inverse BiDi" algorithm, then we
413 * don't need to insert any LRMs, and don't need to test for it.
415 if(!ubidi_isInverse(pBiDi
)) {
416 options
&=~UBIDI_INSERT_LRM_FOR_NUMERIC
;
420 * Iterate through all visual runs and copy the run text segments to
421 * the destination, according to the options.
423 * The tests for where to insert LRMs ignore the fact that there may be
424 * BN codes or non-BMP code points at the beginning and end of a run;
425 * they may insert LRMs unnecessarily but the tests are faster this way
426 * (this would have to be improved for UTF-8).
428 * Note that the only errors that are set by doWriteXY() are buffer overflow
429 * errors. Ignore them until the end, and continue for preflighting.
431 if(!(options
&UBIDI_OUTPUT_REVERSE
)) {
433 if(!(options
&UBIDI_INSERT_LRM_FOR_NUMERIC
)) {
434 /* do not insert BiDi controls */
435 for(run
=0; run
<runCount
; ++run
) {
/* runLength is first the length of the source run and is then overwritten
   with the number of code units reported by the write helper */
436 if(UBIDI_LTR
==ubidi_getVisualRun(pBiDi
, run
, &logicalStart
, &runLength
)) {
437 runLength
=doWriteForward(text
+logicalStart
, runLength
,
439 (uint16_t)(options
&~UBIDI_DO_MIRRORING
), pErrorCode
);
441 runLength
=doWriteReverse(text
+logicalStart
, runLength
,
443 options
, pErrorCode
);
449 /* insert BiDi controls for "inverse BiDi" */
453 for(run
=0; run
<runCount
; ++run
) {
454 dir
=ubidi_getVisualRun(pBiDi
, run
, &logicalStart
, &runLength
);
455 src
=text
+logicalStart
;
/* test the first character of the run before it is written */
458 if(/*run>0 &&*/ u_charDirection(*src
)!=U_LEFT_TO_RIGHT
) {
465 runLength
=doWriteForward(src
, runLength
,
467 (uint16_t)(options
&~UBIDI_DO_MIRRORING
), pErrorCode
);
/* NOTE(review): runLength now holds the value returned by
   doWriteForward(), not the source run length. If the two differ (for
   example when BiDi controls were removed) or the result is 0, then
   src[runLength-1] is not the last code unit of the run - confirm. */
471 if(/*run<runCount-1 &&*/ u_charDirection(src
[runLength
-1])!=U_LEFT_TO_RIGHT
) {
478 if(/*run>0 &&*/ !(MASK_R_AL
&1UL<<u_charDirection(src
[runLength
-1]))) {
485 runLength
=doWriteReverse(src
, runLength
,
487 options
, pErrorCode
);
491 if(/*run<runCount-1 &&*/ !(MASK_R_AL
&1UL<<u_charDirection(*src
))) {
502 if(!(options
&UBIDI_INSERT_LRM_FOR_NUMERIC
)) {
503 /* do not insert BiDi controls */
/* reversed output: visit the runs from the last one down to the first and
   swap the roles of the two write helpers */
504 for(run
=runCount
; --run
>=0;) {
505 if(UBIDI_LTR
==ubidi_getVisualRun(pBiDi
, run
, &logicalStart
, &runLength
)) {
506 runLength
=doWriteReverse(text
+logicalStart
, runLength
,
508 (uint16_t)(options
&~UBIDI_DO_MIRRORING
), pErrorCode
);
510 runLength
=doWriteForward(text
+logicalStart
, runLength
,
512 options
, pErrorCode
);
518 /* insert BiDi controls for "inverse BiDi" */
522 for(run
=runCount
; --run
>=0;) {
524 dir
=ubidi_getVisualRun(pBiDi
, run
, &logicalStart
, &runLength
);
525 src
=text
+logicalStart
;
528 if(/*run<runCount-1 &&*/ u_charDirection(src
[runLength
-1])!=U_LEFT_TO_RIGHT
) {
535 runLength
=doWriteReverse(src
, runLength
,
537 (uint16_t)(options
&~UBIDI_DO_MIRRORING
), pErrorCode
);
541 if(/*run>0 &&*/ u_charDirection(*src
)!=U_LEFT_TO_RIGHT
) {
548 if(/*run<runCount-1 &&*/ !(MASK_R_AL
&1UL<<u_charDirection(*src
))) {
555 runLength
=doWriteForward(src
, runLength
,
557 options
, pErrorCode
);
/* NOTE(review): as in the non-reversed loop, runLength was overwritten by
   the write helper before being used as an index into src here -
   confirm that src[runLength-1] is still the intended code unit. */
561 if(/*run>0 &&*/ !(MASK_R_AL
&1UL<<u_charDirection(src
[runLength
-1]))) {
572 return u_terminateUChars(saveDest
, destCapacity
, destCapacity
-destSize
, pErrorCode
);