2 ******************************************************************************
4 * Copyright (C) 1999-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999jul27
14 * created by: Markus W. Scherer
17 /* set import/export definitions */
18 #ifndef U_COMMON_IMPLEMENTATION
19 # define U_COMMON_IMPLEMENTATION
23 #include "unicode/utypes.h"
24 #include "unicode/ustring.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ubidi.h"
30 * General implementation notes:
32 * Throughout the implementation, there are comments like (W2) that refer to
33 * rules of the BiDi algorithm in its version 5, in this example to the second
34 * rule of the resolution of weak types.
36 * For handling surrogate pairs, where two UChar's form one "abstract" (or UTF-32)
37 * character according to UTF-16, the second UChar gets the directional property of
38 * the entire character assigned, while the first one gets a BN, a boundary
39 * neutral, type, which is ignored by most of the algorithm according to
40 * rule (X9) and the implementation suggestions of the BiDi algorithm.
42 * Later, adjustWSLevels() will set the level for each BN to that of the
43 * following character (UChar), which results in surrogate pairs getting the
44 * same level on each of their surrogates.
46 * In a UTF-8 implementation, the same thing could be done: the last byte of
47 * a multi-byte sequence would get the "real" property, while all previous
48 * bytes of that sequence would get BN.
50 * It is not possible to assign all those parts of a character the same real
51 * property because this would fail in the resolution of weak types with rules
52 * that look at immediately surrounding types.
54 * As a related topic, this implementation does not remove Boundary Neutral
55 * types from the input, but ignores them whereever this is relevant.
56 * For example, the loop for the resolution of the weak types reads
57 * types until it finds a non-BN.
58 * Also, explicit embedding codes are neither changed into BN nor removed.
59 * They are only treated the same way real BNs are.
60 * As stated before, adjustWSLevels() takes care of them at the end.
61 * For the purpose of conformance, the levels of all these codes
64 * Note that this implementation never modifies the dirProps
65 * after the initial setup.
68 * In this implementation, the resolution of weak types (Wn),
69 * neutrals (Nn), and the assignment of the resolved level (In)
70 * are all done in one single loop, in resolveImplicitLevels().
71 * Changes of dirProp values are done on the fly, without writing
72 * them back to the dirProps array.
75 * This implementation contains code that allows to bypass steps of the
76 * algorithm that are not needed on the specific paragraph
77 * in order to speed up the most common cases considerably,
78 * like text that is entirely LTR, or RTL text without numbers.
80 * Most of this is done by setting a bit for each directional property
81 * in a flags variable and later checking for whether there are
82 * any LTR characters or any RTL characters, or both, whether
83 * there are any explicit embedding codes, etc.
85 * If the (Xn) steps are performed, then the flags are re-evaluated,
86 * because they will then not contain the embedding codes any more
87 * and will be adjusted for override codes, so that subsequently
88 * more bypassing may be possible than what the initial flags suggested.
90 * If the text is not mixed-directional, then the
91 * algorithm steps for the weak type resolution are not performed,
92 * and all levels are set to the paragraph level.
94 * If there are no explicit embedding codes, then the (Xn) steps
97 * If embedding levels are supplied as a parameter, then all
98 * explicit embedding codes are ignored, and the (Xn) steps
101 * White Space types could get the level of the run they belong to,
102 * and are checked with a test of (flags&MASK_EMBEDDING) to
103 * consider if the paragraph direction should be considered in
104 * the flags variable.
106 * If there are no White Space types in the paragraph, then
107 * (L1) is not necessary in adjustWSLevels().
110 /* to avoid some conditional statements, use tiny constant arrays */
111 static const Flags flagLR
[2]={ DIRPROP_FLAG(L
), DIRPROP_FLAG(R
) };
112 static const Flags flagE
[2]={ DIRPROP_FLAG(LRE
), DIRPROP_FLAG(RLE
) };
113 static const Flags flagO
[2]={ DIRPROP_FLAG(LRO
), DIRPROP_FLAG(RLO
) };
115 #define DIRPROP_FLAG_LR(level) flagLR[(level)&1]
116 #define DIRPROP_FLAG_E(level) flagE[(level)&1]
117 #define DIRPROP_FLAG_O(level) flagO[(level)&1]
119 /* UBiDi object management -------------------------------------------------- */
121 U_CAPI UBiDi
* U_EXPORT2
124 UErrorCode errorCode
=U_ZERO_ERROR
;
125 return ubidi_openSized(0, 0, &errorCode
);
128 U_CAPI UBiDi
* U_EXPORT2
129 ubidi_openSized(int32_t maxLength
, int32_t maxRunCount
, UErrorCode
*pErrorCode
) {
132 /* check the argument values */
133 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
135 } else if(maxLength
<0 || maxRunCount
<0) {
136 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
137 return NULL
; /* invalid arguments */
140 /* allocate memory for the object */
141 pBiDi
=(UBiDi
*)uprv_malloc(sizeof(UBiDi
));
143 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
147 /* reset the object, all pointers NULL, all flags FALSE, all sizes 0 */
148 uprv_memset(pBiDi
, 0, sizeof(UBiDi
));
150 /* allocate memory for arrays as requested */
152 if( !getInitialDirPropsMemory(pBiDi
, maxLength
) ||
153 !getInitialLevelsMemory(pBiDi
, maxLength
)
155 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
158 pBiDi
->mayAllocateText
=TRUE
;
163 /* use simpleRuns[] */
164 pBiDi
->runsSize
=sizeof(Run
);
165 } else if(!getInitialRunsMemory(pBiDi
, maxRunCount
)) {
166 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
169 pBiDi
->mayAllocateRuns
=TRUE
;
172 if(U_SUCCESS(*pErrorCode
)) {
181 * We are allowed to allocate memory if memory==NULL or
182 * mayAllocate==TRUE for each array that we need.
183 * We also try to grow and shrink memory as needed if we
186 * Assume sizeNeeded>0.
187 * If *pMemory!=NULL, then assume *pSize>0.
189 * ### this realloc() may unnecessarily copy the old data,
190 * which we know we don't need any more;
191 * is this the best way to do this??
194 ubidi_getMemory(void **pMemory
, int32_t *pSize
, UBool mayAllocate
, int32_t sizeNeeded
) {
195 /* check for existing memory */
197 /* we need to allocate memory */
198 if(mayAllocate
&& (*pMemory
=uprv_malloc(sizeNeeded
))!=NULL
) {
205 /* there is some memory, is it enough or too much? */
206 if(sizeNeeded
>*pSize
&& !mayAllocate
) {
207 /* not enough memory, and we must not allocate */
209 } else if(sizeNeeded
!=*pSize
&& mayAllocate
) {
210 /* we may try to grow or shrink */
213 if((memory
=uprv_realloc(*pMemory
, sizeNeeded
))!=NULL
) {
218 /* we failed to grow */
222 /* we have at least enough memory and must not allocate */
228 U_CAPI
void U_EXPORT2
229 ubidi_close(UBiDi
*pBiDi
) {
231 if(pBiDi
->dirPropsMemory
!=NULL
) {
232 uprv_free(pBiDi
->dirPropsMemory
);
234 if(pBiDi
->levelsMemory
!=NULL
) {
235 uprv_free(pBiDi
->levelsMemory
);
237 if(pBiDi
->runsMemory
!=NULL
) {
238 uprv_free(pBiDi
->runsMemory
);
244 /* set to approximate "inverse BiDi" ---------------------------------------- */
246 U_CAPI
void U_EXPORT2
247 ubidi_setInverse(UBiDi
*pBiDi
, UBool isInverse
) {
249 pBiDi
->isInverse
=isInverse
;
253 U_CAPI UBool U_EXPORT2
254 ubidi_isInverse(UBiDi
*pBiDi
) {
256 return pBiDi
->isInverse
;
262 /* perform (P2)..(P3) ------------------------------------------------------- */
265 * Get the directional properties for the text,
266 * calculate the flags bit-set, and
267 * determine the partagraph level if necessary.
270 getDirProps(UBiDi
*pBiDi
, const UChar
*text
) {
271 DirProp
*dirProps
=pBiDi
->dirPropsMemory
; /* pBiDi->dirProps is const */
273 int32_t i
=0, i0
, i1
, length
=pBiDi
->length
;
274 Flags flags
=0; /* collect all directionalities in the text */
278 if(IS_DEFAULT_LEVEL(pBiDi
->paraLevel
)) {
279 /* determine the paragraph level (P2..P3) */
281 i0
=i
; /* index of first code unit */
282 UTF_NEXT_CHAR(text
, i
, length
, uchar
);
283 i1
=i
-1; /* index of last code unit, gets the directional property */
284 flags
|=DIRPROP_FLAG(dirProps
[i1
]=dirProp
=u_charDirection(uchar
));
285 if(i1
>i0
) { /* set previous code units' properties to BN */
286 flags
|=DIRPROP_FLAG(BN
);
295 } else if(dirProp
==R
|| dirProp
==AL
) {
298 } else if(i
>=length
) {
300 * see comment in ubidi.h:
301 * the DEFAULT_XXX values are designed so that
302 * their bit 0 alone yields the intended default
309 flags
|=DIRPROP_FLAG_LR(pBiDi
->paraLevel
);
312 /* get the rest of the directional properties and the flags bits */
314 i0
=i
; /* index of first code unit */
315 UTF_NEXT_CHAR(text
, i
, length
, uchar
);
316 i1
=i
-1; /* index of last code unit, gets the directional property */
317 flags
|=DIRPROP_FLAG(dirProps
[i1
]=dirProp
=u_charDirection(uchar
));
318 if(i1
>i0
) { /* set previous code units' properties to BN */
319 flags
|=DIRPROP_FLAG(BN
);
325 if(flags
&MASK_EMBEDDING
) {
326 flags
|=DIRPROP_FLAG_LR(pBiDi
->paraLevel
);
332 /* perform (X1)..(X9) ------------------------------------------------------- */
334 /* determine if the text is mixed-directional or single-directional */
335 static UBiDiDirection
336 directionFromFlags(Flags flags
) {
337 /* if the text contains AN and neutrals, then some neutrals may become RTL */
338 if(!(flags
&MASK_RTL
|| ((flags
&DIRPROP_FLAG(AN
)) && (flags
&MASK_POSSIBLE_N
)))) {
340 } else if(!(flags
&MASK_LTR
)) {
348 * Resolve the explicit levels as specified by explicit embedding codes.
349 * Recalculate the flags to have them reflect the real properties
350 * after taking the explicit embeddings into account.
352 * The BiDi algorithm is designed to result in the same behavior whether embedding
353 * levels are externally specified (from "styled text", supposedly the preferred
354 * method) or set by explicit embedding codes (LRx, RLx, PDF) in the plain text.
355 * That is why (X9) instructs to remove all explicit codes (and BN).
356 * However, in a real implementation, this removal of these codes and their index
357 * positions in the plain text is undesirable since it would result in
358 * reallocated, reindexed text.
359 * Instead, this implementation leaves the codes in there and just ignores them
360 * in the subsequent processing.
361 * In order to get the same reordering behavior, positions with a BN or an
362 * explicit embedding code just get the same level assigned as the last "real"
365 * Some implementations, not this one, then overwrite some of these
366 * directionality properties at "real" same-level-run boundaries by
367 * L or R codes so that the resolution of weak types can be performed on the
368 * entire paragraph at once instead of having to parse it once more and
369 * perform that resolution on same-level-runs.
370 * This limits the scope of the implicit rules in effectively
371 * the same way as the run limits.
373 * Instead, this implementation does not modify these codes.
374 * On one hand, the paragraph has to be scanned for same-level-runs, but
375 * on the other hand, this saves another loop to reset these codes,
376 * or saves making and modifying a copy of dirProps[].
379 * Note that (Pn) and (Xn) changed significantly from version 4 of the BiDi algorithm.
382 * Handling the stack of explicit levels (Xn):
384 * With the BiDi stack of explicit levels,
385 * as pushed with each LRE, RLE, LRO, and RLO and popped with each PDF,
386 * the explicit level must never exceed UBIDI_MAX_EXPLICIT_LEVEL==61.
388 * In order to have a correct push-pop semantics even in the case of overflows,
389 * there are two overflow counters:
390 * - countOver60 is incremented with each LRx at level 60
391 * - from level 60, one RLx increases the level to 61
392 * - countOver61 is incremented with each LRx and RLx at level 61
394 * Popping levels with PDF must work in the opposite order so that level 61
395 * is correct at the correct point. Underflows (too many PDFs) must be checked.
397 * This implementation assumes that UBIDI_MAX_EXPLICIT_LEVEL is odd.
399 static UBiDiDirection
400 resolveExplicitLevels(UBiDi
*pBiDi
) {
401 const DirProp
*dirProps
=pBiDi
->dirProps
;
402 UBiDiLevel
*levels
=pBiDi
->levels
;
404 int32_t i
=0, length
=pBiDi
->length
;
405 Flags flags
=pBiDi
->flags
; /* collect all directionalities in the text */
407 UBiDiLevel level
=pBiDi
->paraLevel
;
409 UBiDiDirection direction
;
411 /* determine if the text is mixed-directional or single-directional */
412 direction
=directionFromFlags(flags
);
414 /* we may not need to resolve any explicit levels */
415 if(direction
!=UBIDI_MIXED
) {
416 /* not mixed directionality: levels don't matter - trailingWSStart will be 0 */
417 } else if(!(flags
&MASK_EXPLICIT
) || pBiDi
->isInverse
) {
418 /* mixed, but all characters are at the same embedding level */
419 /* or we are in "inverse BiDi" */
420 /* set all levels to the paragraph level */
421 for(i
=0; i
<length
; ++i
) {
425 /* continue to perform (Xn) */
427 /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */
428 /* both variables may carry the UBIDI_LEVEL_OVERRIDE flag to indicate the override status */
429 UBiDiLevel embeddingLevel
=level
, newLevel
, stackTop
=0;
431 UBiDiLevel stack
[UBIDI_MAX_EXPLICIT_LEVEL
]; /* we never push anything >=UBIDI_MAX_EXPLICIT_LEVEL */
432 uint32_t countOver60
=0, countOver61
=0; /* count overflows of explicit levels */
434 /* recalculate the flags */
437 /* since we assume that this is a single paragraph, we ignore (X8) */
438 for(i
=0; i
<length
; ++i
) {
444 newLevel
=(UBiDiLevel
)((embeddingLevel
+2)&~(UBIDI_LEVEL_OVERRIDE
|1)); /* least greater even level */
445 if(newLevel
<=UBIDI_MAX_EXPLICIT_LEVEL
) {
446 stack
[stackTop
]=embeddingLevel
;
448 embeddingLevel
=newLevel
;
450 embeddingLevel
|=UBIDI_LEVEL_OVERRIDE
;
452 embeddingLevel
&=~UBIDI_LEVEL_OVERRIDE
;
454 } else if((embeddingLevel
&~UBIDI_LEVEL_OVERRIDE
)==UBIDI_MAX_EXPLICIT_LEVEL
) {
456 } else /* (embeddingLevel&~UBIDI_LEVEL_OVERRIDE)==UBIDI_MAX_EXPLICIT_LEVEL-1 */ {
459 flags
|=DIRPROP_FLAG(BN
);
464 newLevel
=(UBiDiLevel
)(((embeddingLevel
&~UBIDI_LEVEL_OVERRIDE
)+1)|1); /* least greater odd level */
465 if(newLevel
<=UBIDI_MAX_EXPLICIT_LEVEL
) {
466 stack
[stackTop
]=embeddingLevel
;
468 embeddingLevel
=newLevel
;
470 embeddingLevel
|=UBIDI_LEVEL_OVERRIDE
;
472 embeddingLevel
&=~UBIDI_LEVEL_OVERRIDE
;
477 flags
|=DIRPROP_FLAG(BN
);
481 /* handle all the overflow cases first */
484 } else if(countOver60
>0 && (embeddingLevel
&~UBIDI_LEVEL_OVERRIDE
)!=UBIDI_MAX_EXPLICIT_LEVEL
) {
485 /* handle LRx overflows from level 60 */
487 } else if(stackTop
>0) {
488 /* this is the pop operation; it also pops level 61 while countOver60>0 */
490 embeddingLevel
=stack
[stackTop
];
491 /* } else { (underflow) */
493 flags
|=DIRPROP_FLAG(BN
);
497 * We do not really expect to see a paragraph separator (B),
498 * but we should do something reasonable with it,
499 * especially at the end of the text.
502 countOver60
=countOver61
=0;
503 embeddingLevel
=level
=pBiDi
->paraLevel
;
504 flags
|=DIRPROP_FLAG(B
);
507 /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */
508 /* they will get their levels set correctly in adjustWSLevels() */
509 flags
|=DIRPROP_FLAG(BN
);
512 /* all other types get the "real" level */
513 if(level
!=embeddingLevel
) {
514 level
=embeddingLevel
;
515 if(level
&UBIDI_LEVEL_OVERRIDE
) {
516 flags
|=DIRPROP_FLAG_O(level
)|DIRPROP_FLAG_MULTI_RUNS
;
518 flags
|=DIRPROP_FLAG_E(level
)|DIRPROP_FLAG_MULTI_RUNS
;
521 if(!(level
&UBIDI_LEVEL_OVERRIDE
)) {
522 flags
|=DIRPROP_FLAG(dirProp
);
528 * We need to set reasonable levels even on BN codes and
529 * explicit codes because we will later look at same-level runs (X10).
533 if(flags
&MASK_EMBEDDING
) {
534 flags
|=DIRPROP_FLAG_LR(pBiDi
->paraLevel
);
537 /* subsequently, ignore the explicit codes and BN (X9) */
539 /* again, determine if the text is mixed-directional or single-directional */
541 direction
=directionFromFlags(flags
);
547 * Use a pre-specified embedding levels array:
549 * Adjust the directional properties for overrides (->LEVEL_OVERRIDE),
550 * ignore all explicit codes (X9),
551 * and check all the preset levels.
553 * Recalculate the flags to have them reflect the real properties
554 * after taking the explicit embeddings into account.
556 static UBiDiDirection
557 checkExplicitLevels(UBiDi
*pBiDi
, UErrorCode
*pErrorCode
) {
558 const DirProp
*dirProps
=pBiDi
->dirProps
;
559 UBiDiLevel
*levels
=pBiDi
->levels
;
561 int32_t i
, length
=pBiDi
->length
;
562 Flags flags
=0; /* collect all directionalities in the text */
563 UBiDiLevel level
, paraLevel
=pBiDi
->paraLevel
;
565 for(i
=0; i
<length
; ++i
) {
567 if(level
&UBIDI_LEVEL_OVERRIDE
) {
568 /* keep the override flag in levels[i] but adjust the flags */
569 level
&=~UBIDI_LEVEL_OVERRIDE
; /* make the range check below simpler */
570 flags
|=DIRPROP_FLAG_O(level
);
573 flags
|=DIRPROP_FLAG_E(level
)|DIRPROP_FLAG(dirProps
[i
]);
575 if(level
<paraLevel
|| UBIDI_MAX_EXPLICIT_LEVEL
<level
) {
576 /* level out of bounds */
577 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
581 if(flags
&MASK_EMBEDDING
) {
582 flags
|=DIRPROP_FLAG_LR(pBiDi
->paraLevel
);
585 /* determine if the text is mixed-directional or single-directional */
587 return directionFromFlags(flags
);
590 /* perform rules (Wn), (Nn), and (In) on a run of the text ------------------ */
593 * This implementation of the (Wn) rules applies all rules in one pass.
594 * In order to do so, it needs a look-ahead of typically 1 character
595 * (except for W5: sequences of ET) and keeps track of changes
596 * in a rule Wp that affect a later Wq (p<q).
598 * historyOfEN is a variable-saver: it contains 4 boolean states;
599 * a bit in it set to 1 means:
600 * bit 0: the current code is an EN after W2
601 * bit 1: the current code is an EN after W4
602 * bit 2: the previous code was an EN after W2
603 * bit 3: the previous code was an EN after W4
604 * In other words, b0..1 have transitions of EN in the current iteration,
605 * while b2..3 have the transitions of EN in the previous iteration.
606 * A simple historyOfEN<<=2 suffices for the propagation.
608 * The (Nn) and (In) rules are also performed in that same single loop,
609 * but effectively one iteration behind for white space.
611 * Since all implicit rules are performed in one step, it is not necessary
612 * to actually store the intermediate directional properties in dirProps[].
616 #define EN_AFTER_W2 1
617 #define EN_AFTER_W4 2
619 #define PREV_EN_AFTER_W2 4
620 #define PREV_EN_AFTER_W4 8
623 resolveImplicitLevels(UBiDi
*pBiDi
,
624 int32_t start
, int32_t limit
,
625 DirProp sor
, DirProp eor
) {
626 const DirProp
*dirProps
=pBiDi
->dirProps
;
627 UBiDiLevel
*levels
=pBiDi
->levels
;
629 int32_t i
, next
, neutralStart
=-1;
630 DirProp prevDirProp
, dirProp
, nextDirProp
, lastStrong
, beforeNeutral
=L
;
631 UBiDiLevel numberLevel
;
634 /* initialize: current at sor, next at start (it is start<limit) */
636 dirProp
=lastStrong
=sor
;
637 nextDirProp
=dirProps
[next
];
640 if(pBiDi
->isInverse
) {
642 * For "inverse BiDi", we set the levels of numbers just like for
643 * regular L characters, plus a flag that ubidi_getRuns() will use
644 * to set a similar flag on the corresponding output run.
646 numberLevel
=levels
[start
];
651 /* normal BiDi: least greater even level */
652 numberLevel
=(UBiDiLevel
)((levels
[start
]+2)&~1);
656 * In all steps of this implementation, BN and explicit embedding codes
657 * must be treated as if they didn't exist (X9).
658 * They will get levels set before a non-neutral character, and remain
659 * undefined before a neutral one, but adjustWSLevels() will take care
662 while(DIRPROP_FLAG(nextDirProp
)&MASK_BN_EXPLICIT
) {
664 nextDirProp
=dirProps
[next
];
672 * Note: at the end of this file, there is a prototype
673 * of a version of this function that uses a statetable
674 * at the core of this state machine.
675 * If you make changes to this state machine,
676 * please update that prototype as well.
679 /* loop for entire run */
687 nextDirProp
=dirProps
[next
];
692 } while(DIRPROP_FLAG(nextDirProp
)&MASK_BN_EXPLICIT
);
693 historyOfEN
<<=EN_SHIFT
;
709 /* we have to set historyOfEN correctly */
718 /* this EN stays after (W2) and (W4) - at least before (W7) */
723 if( historyOfEN
&PREV_EN_AFTER_W2
&& /* previous was EN before (W4) */
724 nextDirProp
==EN
&& lastStrong
!=AL
/* next is EN and (W2) won't make it AN */
733 historyOfEN
|=EN_AFTER_W4
;
740 if( historyOfEN
&PREV_EN_AFTER_W2
&& /* previous was EN before (W4) */
741 nextDirProp
==EN
&& lastStrong
!=AL
/* next is EN and (W2) won't make it AN */
750 historyOfEN
|=EN_AFTER_W4
;
751 } else if(prevDirProp
==AN
&& /* previous was AN */
752 (nextDirProp
==AN
|| /* next is AN */
753 (nextDirProp
==EN
&& lastStrong
==AL
)) /* or (W2) will make it one */
763 /* get sequence of ET; advance only next, not current, previous or historyOfEN */
765 while(DIRPROP_FLAG(nextDirProp
)&MASK_ET_NSM_BN
/* (W1), (X9) */) {
767 nextDirProp
=dirProps
[next
];
775 /* now process the sequence of ET like a single ET */
776 if((historyOfEN
&PREV_EN_AFTER_W4
) || /* previous was EN before (W5) */
777 (nextDirProp
==EN
&& lastStrong
!=AL
) /* next is EN and (W2) won't make it AN */
791 /* apply the result of (W1), (W5)..(W7) to the entire sequence of ET */
796 /* set historyOfEN back to prevDirProp's historyOfEN */
797 historyOfEN
>>=EN_SHIFT
;
799 * Technically, this should be done before the switch() in the form
800 * if(nextDirProp==NSM) {
801 * dirProps[next]=nextDirProp=dirProp;
804 * - effectively one iteration ahead.
805 * However, whether the next dirProp is NSM or is equal to the current dirProp
806 * does not change the outcome of any condition in (W2)..(W7).
813 /* here, it is always [prev,this,next]dirProp!=BN; it may be next>i+1 */
815 /* perform (Nn) - here, only L, R, EN, AN, and neutrals are left */
816 /* for "inverse BiDi", treat neutrals like L */
817 /* this is one iteration late for the neutrals */
818 if(DIRPROP_FLAG(dirProp
)&MASK_N
) {
820 /* start of a sequence of neutrals */
822 beforeNeutral
=prevDirProp
;
824 } else /* not a neutral, can be only one of { L, R, EN, AN } */ {
826 * Note that all levels[] values are still the same at this
827 * point because this function is called for an entire
829 * Therefore, we need to read only one actual level.
831 UBiDiLevel level
=levels
[i
];
833 if(neutralStart
>=0) {
835 /* end of a sequence of neutrals (dirProp is "afterNeutral") */
836 if(!(pBiDi
->isInverse
)) {
837 if(beforeNeutral
==L
) {
839 final
=0; /* make all neutrals L (N1) */
841 final
=level
; /* make all neutrals "e" (N2) */
843 } else /* beforeNeutral is one of { R, EN, AN } */ {
845 final
=level
; /* make all neutrals "e" (N2) */
847 final
=1; /* make all neutrals R (N1) */
851 /* "inverse BiDi": collapse [before]dirProps L, EN, AN into L */
852 if(beforeNeutral
!=R
) {
854 final
=0; /* make all neutrals L (N1) */
856 final
=level
; /* make all neutrals "e" (N2) */
858 } else /* beforeNeutral is one of { R, EN, AN } */ {
860 final
=level
; /* make all neutrals "e" (N2) */
862 final
=1; /* make all neutrals R (N1) */
866 /* perform (In) on the sequence of neutrals */
867 if((level
^final
)&1) {
868 /* do something only if we need to _change_ the level */
870 ++levels
[neutralStart
];
871 } while(++neutralStart
<i
);
876 /* perform (In) on the non-neutral character */
878 * in the cases of (W5), processing a sequence of ET,
879 * and of (X9), skipping BN,
880 * there may be multiple characters from i to <next
881 * that all get (virtually) the same dirProp and (really) the same level
887 i
=next
; /* we keep the levels */
889 } else if(dirProp
==R
) {
893 i
=next
; /* we keep the levels */
895 } else /* EN or AN */ {
896 /* this level depends on whether we do "inverse BiDi" */
900 /* apply the new level to the sequence, if necessary */
907 /* perform (Nn) - here,
908 the character after the the neutrals is eor, which is either L or R */
909 /* this is one iteration late for the neutrals */
910 if(neutralStart
>=0) {
912 * Note that all levels[] values are still the same at this
913 * point because this function is called for an entire
915 * Therefore, we need to read only one actual level.
917 UBiDiLevel level
=levels
[neutralStart
], final
;
919 /* end of a sequence of neutrals (eor is "afterNeutral") */
920 if(!(pBiDi
->isInverse
)) {
921 if(beforeNeutral
==L
) {
923 final
=0; /* make all neutrals L (N1) */
925 final
=level
; /* make all neutrals "e" (N2) */
927 } else /* beforeNeutral is one of { R, EN, AN } */ {
929 final
=level
; /* make all neutrals "e" (N2) */
931 final
=1; /* make all neutrals R (N1) */
935 /* "inverse BiDi": collapse [before]dirProps L, EN, AN into L */
936 if(beforeNeutral
!=R
) {
938 final
=0; /* make all neutrals L (N1) */
940 final
=level
; /* make all neutrals "e" (N2) */
942 } else /* beforeNeutral is one of { R, EN, AN } */ {
944 final
=level
; /* make all neutrals "e" (N2) */
946 final
=1; /* make all neutrals R (N1) */
950 /* perform (In) on the sequence of neutrals */
951 if((level
^final
)&1) {
952 /* do something only if we need to _change_ the level */
954 ++levels
[neutralStart
];
955 } while(++neutralStart
<limit
);
960 /* perform (L1) and (X9) ---------------------------------------------------- */
963 * Reset the embedding levels for some non-graphic characters (L1).
964 * This function also sets appropriate levels for BN, and
965 * explicit embedding types that are supposed to have been removed
966 * from the paragraph in (X9).
969 adjustWSLevels(UBiDi
*pBiDi
) {
970 const DirProp
*dirProps
=pBiDi
->dirProps
;
971 UBiDiLevel
*levels
=pBiDi
->levels
;
974 if(pBiDi
->flags
&MASK_WS
) {
975 UBiDiLevel paraLevel
=pBiDi
->paraLevel
;
978 i
=pBiDi
->trailingWSStart
;
980 /* reset a sequence of WS/BN before eop and B/S to the paragraph paraLevel */
981 while(i
>0 && DIRPROP_FLAG(dirProps
[--i
])&MASK_WS
) {
985 /* reset BN to the next character's paraLevel until B/S, which restarts above loop */
986 /* here, i+1 is guaranteed to be <length */
988 flag
=DIRPROP_FLAG(dirProps
[--i
]);
989 if(flag
&MASK_BN_EXPLICIT
) {
990 levels
[i
]=levels
[i
+1];
991 } else if(flag
&MASK_B_S
) {
1000 /* ubidi_setPara ------------------------------------------------------------ */
1002 U_CAPI
void U_EXPORT2
1003 ubidi_setPara(UBiDi
*pBiDi
, const UChar
*text
, int32_t length
,
1004 UBiDiLevel paraLevel
, UBiDiLevel
*embeddingLevels
,
1005 UErrorCode
*pErrorCode
) {
1006 UBiDiDirection direction
;
1008 /* check the argument values */
1009 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1011 } else if(pBiDi
==NULL
|| text
==NULL
||
1012 ((UBIDI_MAX_EXPLICIT_LEVEL
<paraLevel
) && !IS_DEFAULT_LEVEL(paraLevel
)) ||
1015 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1020 length
=u_strlen(text
);
1023 /* initialize the UBiDi structure */
1025 pBiDi
->length
=length
;
1026 pBiDi
->paraLevel
=paraLevel
;
1027 pBiDi
->direction
=UBIDI_LTR
;
1028 pBiDi
->trailingWSStart
=length
; /* the levels[] will reflect the WS run */
1030 pBiDi
->dirProps
=NULL
;
1036 * For an empty paragraph, create a UBiDi object with the paraLevel and
1037 * the flags and the direction set but without allocating zero-length arrays.
1038 * There is nothing more to do.
1040 if(IS_DEFAULT_LEVEL(paraLevel
)) {
1041 pBiDi
->paraLevel
&=1;
1044 pBiDi
->flags
=DIRPROP_FLAG(R
);
1045 pBiDi
->direction
=UBIDI_RTL
;
1047 pBiDi
->flags
=DIRPROP_FLAG(L
);
1048 pBiDi
->direction
=UBIDI_LTR
;
1058 * Get the directional properties,
1059 * the flags bit-set, and
1060 * determine the partagraph level if necessary.
1062 if(getDirPropsMemory(pBiDi
, length
)) {
1063 pBiDi
->dirProps
=pBiDi
->dirPropsMemory
;
1064 getDirProps(pBiDi
, text
);
1066 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1070 /* are explicit levels specified? */
1071 if(embeddingLevels
==NULL
) {
1072 /* no: determine explicit levels according to the (Xn) rules */\
1073 if(getLevelsMemory(pBiDi
, length
)) {
1074 pBiDi
->levels
=pBiDi
->levelsMemory
;
1075 direction
=resolveExplicitLevels(pBiDi
);
1077 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1081 /* set BN for all explicit codes, check that all levels are paraLevel..UBIDI_MAX_EXPLICIT_LEVEL */
1082 pBiDi
->levels
=embeddingLevels
;
1083 direction
=checkExplicitLevels(pBiDi
, pErrorCode
);
1084 if(U_FAILURE(*pErrorCode
)) {
1090 * The steps after (X9) in the UBiDi algorithm are performed only if
1091 * the paragraph text has mixed directionality!
1093 pBiDi
->direction
=direction
;
1096 /* make sure paraLevel is even */
1097 pBiDi
->paraLevel
=(UBiDiLevel
)((pBiDi
->paraLevel
+1)&~1);
1099 /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
1100 pBiDi
->trailingWSStart
=0;
1103 /* make sure paraLevel is odd */
1104 pBiDi
->paraLevel
|=1;
1106 /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
1107 pBiDi
->trailingWSStart
=0;
1111 * If there are no external levels specified and there
1112 * are no significant explicit level codes in the text,
1113 * then we can treat the entire paragraph as one run.
1114 * Otherwise, we need to perform the following rules on runs of
1115 * the text with the same embedding levels. (X10)
1116 * "Significant" explicit level codes are ones that actually
1117 * affect non-BN characters.
1118 * Examples for "insignificant" ones are empty embeddings
1119 * LRE-PDF, LRE-RLE-PDF-PDF, etc.
1121 if(embeddingLevels
==NULL
&& !(pBiDi
->flags
&DIRPROP_FLAG_MULTI_RUNS
)) {
1122 resolveImplicitLevels(pBiDi
, 0, length
,
1123 GET_LR_FROM_LEVEL(pBiDi
->paraLevel
),
1124 GET_LR_FROM_LEVEL(pBiDi
->paraLevel
));
1126 /* sor, eor: start and end types of same-level-run */
1127 UBiDiLevel
*levels
=pBiDi
->levels
;
1128 int32_t start
, limit
=0;
1129 UBiDiLevel level
, nextLevel
;
1132 /* determine the first sor and set eor to it because of the loop body (sor=eor there) */
1133 level
=pBiDi
->paraLevel
;
1134 nextLevel
=levels
[0];
1135 if(level
<nextLevel
) {
1136 eor
=GET_LR_FROM_LEVEL(nextLevel
);
1138 eor
=GET_LR_FROM_LEVEL(level
);
1142 /* determine start and limit of the run (end points just behind the run) */
1144 /* the values for this run's start are the same as for the previous run's end */
1149 /* search for the limit of this run */
1150 while(++limit
<length
&& levels
[limit
]==level
) {}
1152 /* get the correct level of the next run */
1154 nextLevel
=levels
[limit
];
1156 nextLevel
=pBiDi
->paraLevel
;
1159 /* determine eor from max(level, nextLevel); sor is last run's eor */
1160 if((level
&~UBIDI_LEVEL_OVERRIDE
)<(nextLevel
&~UBIDI_LEVEL_OVERRIDE
)) {
1161 eor
=GET_LR_FROM_LEVEL(nextLevel
);
1163 eor
=GET_LR_FROM_LEVEL(level
);
1166 /* if the run consists of overridden directional types, then there
1167 are no implicit types to be resolved */
1168 if(!(level
&UBIDI_LEVEL_OVERRIDE
)) {
1169 resolveImplicitLevels(pBiDi
, start
, limit
, sor
, eor
);
1171 /* remove the UBIDI_LEVEL_OVERRIDE flags */
1173 levels
[start
++]&=~UBIDI_LEVEL_OVERRIDE
;
1174 } while(start
<limit
);
1176 } while(limit
<length
);
1179 /* reset the embedding levels for some non-graphic characters (L1), (X9) */
1180 adjustWSLevels(pBiDi
);
1182 /* for "inverse BiDi", ubidi_getRuns() modifies the levels of numeric runs following RTL runs */
1183 if(pBiDi
->isInverse
) {
1184 if(!ubidi_getRuns(pBiDi
)) {
1185 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1193 U_CAPI UBiDiDirection U_EXPORT2
1194 ubidi_getDirection(const UBiDi
*pBiDi
) {
1196 return pBiDi
->direction
;
1202 U_CAPI
const UChar
* U_EXPORT2
1203 ubidi_getText(const UBiDi
*pBiDi
) {
1211 U_CAPI
int32_t U_EXPORT2
1212 ubidi_getLength(const UBiDi
*pBiDi
) {
1214 return pBiDi
->length
;
1220 U_CAPI UBiDiLevel U_EXPORT2
1221 ubidi_getParaLevel(const UBiDi
*pBiDi
) {
1223 return pBiDi
->paraLevel
;
1229 /* statetable prototype ----------------------------------------------------- */
1232 * This is here for possible future
1233 * performance work and is not compiled right now.
1238 * This is a piece of code that could be part of ubidi.c/resolveImplicitLevels().
1239 * It replaces in the (Wn) state machine the switch()-if()-cascade with
1240 * just a few if()s and a state table.
1243 /* use the state table only for the following dirProp's */
1244 #define MASK_W_TABLE (FLAG(L)|FLAG(R)|FLAG(AL)|FLAG(EN)|FLAG(ES)|FLAG(CS)|FLAG(ET)|FLAG(AN))
1249 * 0..1 historyOfEN - 2b
1250 * 2 prevDirProp==AN - 1b
1251 * 3..4 lastStrong, one of { L, R, AL, none } - 2b
1252 * 5..7 dirProp, one of { L, R, AL, EN, ES, CS, ET, AN } - 3b
1253 * 8..9 nextDirProp, one of { EN, AN, other }
1255 * total: 10b=1024 states
1257 enum { _L
, _R
, _AL
, _EN
, _ES
, _CS
, _ET
, _AN
, _OTHER
}; /* lastStrong, dirProp */
1258 enum { __EN
, __AN
, __OTHER
}; /* nextDirProp */
1260 #define LAST_STRONG_SHIFT 3
1261 #define DIR_PROP_SHIFT 5
1262 #define NEXT_DIR_PROP_SHIFT 8
1264 /* masks after shifting */
1265 #define LAST_STRONG_MASK 3
1266 #define DIR_PROP_MASK 7
1267 #define STATE_MASK 0x1f
1269 /* convert dirProp into _dirProp (above enum) */
1270 static DirProp inputDirProp
[dirPropCount
]={ _X
<<DIR_PROP_SHIFT
, ... };
1272 /* convert dirProp into __dirProp (above enum) */
1273 static DirProp inputNextDirProp
[dirPropCount
]={ __X
<<NEXT_DIR_PROP_SHIFT
, ... };
1278 * dirProp, one of { L, R, EN, AN, ON } - 3b
1280 * 0..1 historyOfEN - 2b
1281 * 2 prevDirProp==AN - 1b
1282 * 3..4 lastStrong, one of { L, R, AL, none } - 2b
1283 * 5..7 new dirProp, one of { L, R, EN, AN, ON }
1285 * total: 8 bits=1 byte per state
1287 enum { ___L
, ___R
, ___EN
, ___AN
, ___ON
, ___count
};
1289 /* convert ___dirProp into dirProp (above enum) */
1290 static DirProp outputDirProp
[___count
]={ X
, ... };
1293 static uint8_t wnTable
[1024]={ /* calculate with switch()-if()-cascade */ };
1296 resolveImplicitLevels(BiDi
*pBiDi
,
1297 Index start
, Index end
,
1298 DirProp sor
, DirProp eor
) {
1302 /* remove variable lastStrong */
1304 /* set initial state (set lastStrong, the rest is 0) */
1305 state
= sor
==L
? 0 : _R
<<LAST_STRONG_SHIFT
;
1309 prevDirProp
=dirProp
;
1310 dirProp
=nextDirProp
;
1314 nextDirProp
=dirProps
[next
];
1319 } while(FLAG(nextDirProp
)&MASK_BN_EXPLICIT
);
1322 /* ### This may be more efficient with a switch(dirProp). */
1323 if(FLAG(dirProp
)&MASK_W_TABLE
) {
1326 inputDirProp
[dirProp
]|
1327 inputNextDirProp
[nextDirProp
]
1329 dirProp
=outputDirProp
[state
>>DIR_PROP_SHIFT
];
1331 } else if(dirProp
==ET
) {
1332 /* get sequence of ET; advance only next, not current, previous or historyOfEN */
1333 while(next
<limit
&& FLAG(nextDirProp
)&MASK_ET_NSM_BN
/* (W1), (X9) */) {
1335 nextDirProp
=dirProps
[next
];
1344 _ET
<<DIR_PROP_SHIFT
|
1345 inputNextDirProp
[nextDirProp
]
1347 dirProp
=outputDirProp
[state
>>DIR_PROP_SHIFT
];
1350 /* apply the result of (W1), (W5)..(W7) to the entire sequence of ET */
1351 } else if(dirProp
==NSM
) {
1353 dirProp
=prevDirProp
;
1354 /* keep prevDirProp's EN and AN states! */
1355 } else /* other */ {
1356 /* set EN and AN states to 0 */
1357 state
&=LAST_STRONG_MASK
<<LAST_STRONG_SHIFT
;
1360 /* perform (Nn) and (In) as usual */
1362 /* perform (Nn) and (In) as usual */