]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ubidiwrt.c
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / ubidiwrt.c
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2000-2001, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ubidiwrt.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999aug06
14 * created by: Markus W. Scherer
15 *
16 * This file contains implementations for BiDi functions that use
17 * the core algorithm and core API to write reordered text.
18 */
19
20 /* set import/export definitions */
21 #ifndef U_COMMON_IMPLEMENTATION
22 # define U_COMMON_IMPLEMENTATION
23 #endif
24
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ubidi.h"
29 #include "cmemory.h"
30 #include "ustr_imp.h"
31 #include "ubidiimp.h"
32
33 /*
34 * The function implementations in this file are designed
35 * for UTF-16 and UTF-32, not for UTF-8.
36 *
37 * Assumptions that are not true for UTF-8:
38 * - Any code point always needs the same number of code units
39 * ("minimum-length-problem" of UTF-8)
40 * - The BiDi control characters need only one code unit each
41 *
42 * Further assumptions for all UTFs:
43 * - u_charMirror(c) needs the same number of code units as c
44 */
45 #if UTF_SIZE==8
46 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
47 #endif
48
/** Code points of the BiDi control characters that this file inserts or removes. */
enum {
    LRM_CHAR=0x200e,    /* U+200E LEFT-TO-RIGHT MARK */
    RLM_CHAR=0x200f,    /* U+200F RIGHT-TO-LEFT MARK */
    LRE_CHAR=0x202a,    /* U+202A LEFT-TO-RIGHT EMBEDDING */
    RLE_CHAR=0x202b,    /* U+202B RIGHT-TO-LEFT EMBEDDING */
    PDF_CHAR=0x202c,    /* U+202C POP DIRECTIONAL FORMATTING */
    LRO_CHAR=0x202d,    /* U+202D LEFT-TO-RIGHT OVERRIDE */
    RLO_CHAR=0x202e     /* U+202E RIGHT-TO-LEFT OVERRIDE */
};
59
/*
 * Non-zero if c is LRM_CHAR or LRM_CHAR+1, or lies in the range
 * LRE_CHAR..LRE_CHAR+4. The argument may be evaluated more than once.
 */
#define IS_BIDI_CONTROL_CHAR(c) \
    ((((uint32_t)(c))&~(uint32_t)1)==LRM_CHAR || (uint32_t)((c)-LRE_CHAR)<=4)

/*
 * Non-zero if the general category value "type" is one of the three mark
 * categories (non-spacing, enclosing, combining spacing).
 */
#define IS_COMBINING(type) \
    (((1UL<<U_NON_SPACING_MARK)|(1UL<<U_ENCLOSING_MARK)|(1UL<<U_COMBINING_SPACING_MARK))&(1UL<<(type)))
62
63 /*
64 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
65 * semantically write RTL runs in reverse and later reverse them again.
66 * Instead, we actually write them in forward order to begin with.
67 * However, if the RTL run was to be mirrored, we need to mirror here now
68 * since the implicit second reversal must not do it.
69 * It looks strange to do mirroring in LTR output, but it is only because
70 * we are writing RTL output in reverse.
71 */
72 static int32_t
73 doWriteForward(const UChar *src, int32_t srcLength,
74 UChar *dest, int32_t destSize,
75 uint16_t options,
76 UErrorCode *pErrorCode) {
77 /* optimize for several combinations of options */
78 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
79 case 0: {
80 /* simply copy the LTR run to the destination */
81 int32_t length=srcLength;
82 if(destSize<length) {
83 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
84 return srcLength;
85 }
86 do {
87 *dest++=*src++;
88 } while(--length>0);
89 return srcLength;
90 }
91 case UBIDI_DO_MIRRORING: {
92 /* do mirroring */
93 int32_t i=0, j=0;
94 UChar32 c;
95
96 if(destSize<srcLength) {
97 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
98 return srcLength;
99 }
100 do {
101 UTF_NEXT_CHAR(src, i, srcLength, c);
102 c=u_charMirror(c);
103 UTF_APPEND_CHAR_UNSAFE(dest, j, c);
104 } while(i<srcLength);
105 return srcLength;
106 }
107 case UBIDI_REMOVE_BIDI_CONTROLS: {
108 /* copy the LTR run and remove any BiDi control characters */
109 int32_t remaining=destSize;
110 UChar c;
111 do {
112 c=*src++;
113 if(!IS_BIDI_CONTROL_CHAR(c)) {
114 if(--remaining<0) {
115 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
116
117 /* preflight the length */
118 while(--srcLength>0) {
119 c=*src++;
120 if(!IS_BIDI_CONTROL_CHAR(c)) {
121 --remaining;
122 }
123 }
124 return destSize-remaining;
125 }
126 *dest++=c;
127 }
128 } while(--srcLength>0);
129 return destSize-remaining;
130 }
131 default: {
132 /* remove BiDi control characters and do mirroring */
133 int32_t remaining=destSize;
134 int32_t i, j=0;
135 UChar32 c;
136 do {
137 i=0;
138 UTF_NEXT_CHAR(src, i, srcLength, c);
139 src+=i;
140 srcLength-=i;
141 if(!IS_BIDI_CONTROL_CHAR(c)) {
142 remaining-=i;
143 if(remaining<0) {
144 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
145
146 /* preflight the length */
147 while(srcLength>0) {
148 c=*src++;
149 if(!IS_BIDI_CONTROL_CHAR(c)) {
150 --remaining;
151 }
152 --srcLength;
153 }
154 return destSize-remaining;
155 }
156 c=u_charMirror(c);
157 UTF_APPEND_CHAR_UNSAFE(dest, j, c);
158 }
159 } while(srcLength>0);
160 return j;
161 }
162 } /* end of switch */
163 }
164
165 static int32_t
166 doWriteReverse(const UChar *src, int32_t srcLength,
167 UChar *dest, int32_t destSize,
168 uint16_t options,
169 UErrorCode *pErrorCode) {
170 /*
171 * RTL run -
172 *
173 * RTL runs need to be copied to the destination in reverse order
174 * of code points, not code units, to keep Unicode characters intact.
175 *
176 * The general strategy for this is to read the source text
177 * in backward order, collect all code units for a code point
178 * (and optionally following combining characters, see below),
179 * and copy all these code units in ascending order
180 * to the destination for this run.
181 *
182 * Several options request whether combining characters
183 * should be kept after their base characters,
184 * whether BiDi control characters should be removed, and
185 * whether characters should be replaced by their mirror-image
186 * equivalent Unicode characters.
187 */
188 int32_t i, j;
189 UChar32 c;
190
191 /* optimize for several combinations of options */
192 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
193 case 0:
194 /*
195 * With none of the "complicated" options set, the destination
196 * run will have the same length as the source run,
197 * and there is no mirroring and no keeping combining characters
198 * with their base characters.
199 */
200 if(destSize<srcLength) {
201 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
202 return srcLength;
203 }
204 destSize=srcLength;
205
206 /* preserve character integrity */
207 do {
208 /* i is always after the last code unit known to need to be kept in this segment */
209 i=srcLength;
210
211 /* collect code units for one base character */
212 UTF_BACK_1(src, 0, srcLength);
213
214 /* copy this base character */
215 j=srcLength;
216 do {
217 *dest++=src[j++];
218 } while(j<i);
219 } while(srcLength>0);
220 break;
221 case UBIDI_KEEP_BASE_COMBINING:
222 /*
223 * Here, too, the destination
224 * run will have the same length as the source run,
225 * and there is no mirroring.
226 * We do need to keep combining characters with their base characters.
227 */
228 if(destSize<srcLength) {
229 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
230 return srcLength;
231 }
232 destSize=srcLength;
233
234 /* preserve character integrity */
235 do {
236 /* i is always after the last code unit known to need to be kept in this segment */
237 i=srcLength;
238
239 /* collect code units and modifier letters for one base character */
240 do {
241 UTF_PREV_CHAR(src, 0, srcLength, c);
242 } while(srcLength>0 && IS_COMBINING(u_charType(c)));
243
244 /* copy this "user character" */
245 j=srcLength;
246 do {
247 *dest++=src[j++];
248 } while(j<i);
249 } while(srcLength>0);
250 break;
251 default:
252 /*
253 * With several "complicated" options set, this is the most
254 * general and the slowest copying of an RTL run.
255 * We will do mirroring, remove BiDi controls, and
256 * keep combining characters with their base characters
257 * as requested.
258 */
259 if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
260 i=srcLength;
261 } else {
262 /* we need to find out the destination length of the run,
263 which will not include the BiDi control characters */
264 int32_t length=srcLength;
265 UChar ch;
266
267 i=0;
268 do {
269 ch=*src++;
270 if(!IS_BIDI_CONTROL_CHAR(ch)) {
271 ++i;
272 }
273 } while(--length>0);
274 src-=srcLength;
275 }
276
277 if(destSize<i) {
278 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
279 return i;
280 }
281 destSize=i;
282
283 /* preserve character integrity */
284 do {
285 /* i is always after the last code unit known to need to be kept in this segment */
286 i=srcLength;
287
288 /* collect code units for one base character */
289 UTF_PREV_CHAR(src, 0, srcLength, c);
290 if(options&UBIDI_KEEP_BASE_COMBINING) {
291 /* collect modifier letters for this base character */
292 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
293 UTF_PREV_CHAR(src, 0, srcLength, c);
294 }
295 }
296
297 if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
298 /* do not copy this BiDi control character */
299 continue;
300 }
301
302 /* copy this "user character" */
303 j=srcLength;
304 if(options&UBIDI_DO_MIRRORING) {
305 /* mirror only the base character */
306 int32_t k=0;
307 c=u_charMirror(c);
308 UTF_APPEND_CHAR_UNSAFE(dest, k, c);
309 dest+=k;
310 j+=k;
311 }
312 while(j<i) {
313 *dest++=src[j++];
314 }
315 } while(srcLength>0);
316 break;
317 } /* end of switch */
318
319 return destSize;
320 }
321
322 U_CAPI int32_t U_EXPORT2
323 ubidi_writeReverse(const UChar *src, int32_t srcLength,
324 UChar *dest, int32_t destSize,
325 uint16_t options,
326 UErrorCode *pErrorCode) {
327 int32_t destLength;
328
329 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
330 return 0;
331 }
332
333 /* more error checking */
334 if( src==NULL || srcLength<-1 ||
335 destSize<0 || (destSize>0 && dest==NULL))
336 {
337 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
338 return 0;
339 }
340
341 /* do input and output overlap? */
342 if( dest!=NULL &&
343 ((src>=dest && src<dest+destSize) ||
344 (dest>=src && dest<src+srcLength)))
345 {
346 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
347 return 0;
348 }
349
350 if(srcLength==-1) {
351 srcLength=u_strlen(src);
352 }
353 if(srcLength>0) {
354 destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
355 } else {
356 /* nothing to do */
357 destLength=0;
358 }
359
360 return u_terminateUChars(dest, destSize, destLength, pErrorCode);
361 }
362
363 #define MASK_R_AL (1UL<<U_RIGHT_TO_LEFT|1UL<<U_RIGHT_TO_LEFT_ARABIC)
364
365 U_CAPI int32_t U_EXPORT2
366 ubidi_writeReordered(UBiDi *pBiDi,
367 UChar *dest, int32_t destSize,
368 uint16_t options,
369 UErrorCode *pErrorCode) {
370 const UChar *text;
371 UChar *saveDest;
372 int32_t length, destCapacity;
373 int32_t run, runCount, logicalStart, runLength;
374
375 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
376 return 0;
377 }
378
379 /* more error checking */
380 if( pBiDi==NULL ||
381 (text=ubidi_getText(pBiDi))==NULL || (length=ubidi_getLength(pBiDi))<0 ||
382 destSize<0 || (destSize>0 && dest==NULL))
383 {
384 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
385 return 0;
386 }
387
388 /* do input and output overlap? */
389 if( dest!=NULL &&
390 ((text>=dest && text<dest+destSize) ||
391 (dest>=text && dest<text+length)))
392 {
393 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
394 return 0;
395 }
396
397 if(length==0) {
398 /* nothing to do */
399 return u_terminateUChars(dest, destSize, 0, pErrorCode);
400 }
401
402 runCount=ubidi_countRuns(pBiDi, pErrorCode);
403 if(U_FAILURE(*pErrorCode)) {
404 return 0;
405 }
406
407 /* destSize shrinks, later destination length=destCapacity-destSize */
408 saveDest=dest;
409 destCapacity=destSize;
410
411 /*
412 * If we do not perform the "inverse BiDi" algorithm, then we
413 * don't need to insert any LRMs, and don't need to test for it.
414 */
415 if(!ubidi_isInverse(pBiDi)) {
416 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
417 }
418
419 /*
420 * Iterate through all visual runs and copy the run text segments to
421 * the destination, according to the options.
422 *
423 * The tests for where to insert LRMs ignore the fact that there may be
424 * BN codes or non-BMP code points at the beginning and end of a run;
425 * they may insert LRMs unnecessarily but the tests are faster this way
426 * (this would have to be improved for UTF-8).
427 *
428 * Note that the only errors that are set by doWriteXY() are buffer overflow
429 * errors. Ignore them until the end, and continue for preflighting.
430 */
431 if(!(options&UBIDI_OUTPUT_REVERSE)) {
432 /* forward output */
433 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
434 /* do not insert BiDi controls */
435 for(run=0; run<runCount; ++run) {
436 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
437 runLength=doWriteForward(text+logicalStart, runLength,
438 dest, destSize,
439 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
440 } else {
441 runLength=doWriteReverse(text+logicalStart, runLength,
442 dest, destSize,
443 options, pErrorCode);
444 }
445 dest+=runLength;
446 destSize-=runLength;
447 }
448 } else {
449 /* insert BiDi controls for "inverse BiDi" */
450 const UChar *src;
451 UBiDiDirection dir;
452
453 for(run=0; run<runCount; ++run) {
454 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
455 src=text+logicalStart;
456
457 if(UBIDI_LTR==dir) {
458 if(/*run>0 &&*/ u_charDirection(*src)!=U_LEFT_TO_RIGHT) {
459 if(destSize>0) {
460 *dest++=LRM_CHAR;
461 }
462 --destSize;
463 }
464
465 runLength=doWriteForward(src, runLength,
466 dest, destSize,
467 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
468 dest+=runLength;
469 destSize-=runLength;
470
471 if(/*run<runCount-1 &&*/ u_charDirection(src[runLength-1])!=U_LEFT_TO_RIGHT) {
472 if(destSize>0) {
473 *dest++=LRM_CHAR;
474 }
475 --destSize;
476 }
477 } else {
478 if(/*run>0 &&*/ !(MASK_R_AL&1UL<<u_charDirection(src[runLength-1]))) {
479 if(destSize>0) {
480 *dest++=RLM_CHAR;
481 }
482 --destSize;
483 }
484
485 runLength=doWriteReverse(src, runLength,
486 dest, destSize,
487 options, pErrorCode);
488 dest+=runLength;
489 destSize-=runLength;
490
491 if(/*run<runCount-1 &&*/ !(MASK_R_AL&1UL<<u_charDirection(*src))) {
492 if(destSize>0) {
493 *dest++=RLM_CHAR;
494 }
495 --destSize;
496 }
497 }
498 }
499 }
500 } else {
501 /* reverse output */
502 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
503 /* do not insert BiDi controls */
504 for(run=runCount; --run>=0;) {
505 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
506 runLength=doWriteReverse(text+logicalStart, runLength,
507 dest, destSize,
508 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
509 } else {
510 runLength=doWriteForward(text+logicalStart, runLength,
511 dest, destSize,
512 options, pErrorCode);
513 }
514 dest+=runLength;
515 destSize-=runLength;
516 }
517 } else {
518 /* insert BiDi controls for "inverse BiDi" */
519 const UChar *src;
520 UBiDiDirection dir;
521
522 for(run=runCount; --run>=0;) {
523 /* reverse output */
524 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
525 src=text+logicalStart;
526
527 if(UBIDI_LTR==dir) {
528 if(/*run<runCount-1 &&*/ u_charDirection(src[runLength-1])!=U_LEFT_TO_RIGHT) {
529 if(destSize>0) {
530 *dest++=LRM_CHAR;
531 }
532 --destSize;
533 }
534
535 runLength=doWriteReverse(src, runLength,
536 dest, destSize,
537 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
538 dest+=runLength;
539 destSize-=runLength;
540
541 if(/*run>0 &&*/ u_charDirection(*src)!=U_LEFT_TO_RIGHT) {
542 if(destSize>0) {
543 *dest++=LRM_CHAR;
544 }
545 --destSize;
546 }
547 } else {
548 if(/*run<runCount-1 &&*/ !(MASK_R_AL&1UL<<u_charDirection(*src))) {
549 if(destSize>0) {
550 *dest++=RLM_CHAR;
551 }
552 --destSize;
553 }
554
555 runLength=doWriteForward(src, runLength,
556 dest, destSize,
557 options, pErrorCode);
558 dest+=runLength;
559 destSize-=runLength;
560
561 if(/*run>0 &&*/ !(MASK_R_AL&1UL<<u_charDirection(src[runLength-1]))) {
562 if(destSize>0) {
563 *dest++=RLM_CHAR;
564 }
565 --destSize;
566 }
567 }
568 }
569 }
570 }
571
572 return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
573 }