1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 1999-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
11 * Modification History:
13 * Date Name Description
14 * 07/08/2002 Eric Mader Creation.
15 ******************************************************************************
18 #include "unicode/uscript.h"
/* Number of slots in the circular stack of open paired characters. */
#define PAREN_STACK_DEPTH 32

/* Reduce a stack index modulo the stack depth. */
#define MOD(sp) ((sp) % PAREN_STACK_DEPTH)

/*
 * Move a stack index forward (INC) or backward (DEC) by count slots and wrap
 * it with MOD. DEC adds the stack depth before subtracting count.
 */
#define INC(sp,count) (MOD((sp) + (count)))
#define DEC(sp,count) (MOD((sp) + PAREN_STACK_DEPTH - (count)))
#define INC1(sp) (INC(sp, 1))
#define DEC1(sp) (DEC(sp, 1))

/* Add one to a counter, but never yield more than PAREN_STACK_DEPTH. */
#define LIMIT_INC(sp) (((sp) >= PAREN_STACK_DEPTH)? PAREN_STACK_DEPTH : (sp) + 1)

/* Stack state queries, driven by the pushCount member of the run state. */
#define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0)
#define STACK_IS_NOT_EMPTY(scriptRun) ((scriptRun)->pushCount > 0)

/* The stack slot currently addressed by parenSP. */
#define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP])

/* Forget any pending fixups by zeroing fixupCount. */
#define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0)
/*
 * One slot of the paren stack. push() stores both a pairIndex and a
 * scriptCode into each slot; only the scriptCode member is visible in this
 * listing.
 */
35 struct ParenStackEntry
38 UScriptCode scriptCode
;
/*
 * NOTE(review): the members below are accessed through UScriptRun pointers
 * elsewhere in this file (textArray, scriptCode, parenStack), so they appear
 * to be part of the UScriptRun state rather than of ParenStackEntry. The
 * enclosing declaration is not visible here -- confirm against the full file.
 */
/* The UTF-16 text being scanned; assigned by uscript_setRunText(). */
44 const UChar
*textArray
;
/* Script code of the current run; reported by uscript_nextRun(). */
48 UScriptCode scriptCode
;
/* Stack of open paired characters, PAREN_STACK_DEPTH slots, indexed by parenSP. */
50 struct ParenStackEntry parenStack
[PAREN_STACK_DEPTH
];
56 static int8_t highBit(int32_t value
);
/*
 * Table of paired punctuation code points, laid out as (open, close) pairs:
 * uscript_nextRun() treats an even index into this table as an opening
 * character and an odd index as the matching closing character.
 *
 * NOTE(review): getPairIndex() probes this table with >= comparisons, so the
 * entries presumably have to stay in ascending order -- confirm before
 * adding or reordering rows.
 */
58 static const UChar32 pairedChars
[] = {
59 0x0028, 0x0029, /* ascii paired punctuation */
63 0x00ab, 0x00bb, /* guillemets */
64 0x2018, 0x2019, /* general punctuation */
67 0x3008, 0x3009, /* chinese paired punctuation */
78 static void push(UScriptRun
*scriptRun
, int32_t pairIndex
, UScriptCode scriptCode
)
80 scriptRun
->pushCount
= LIMIT_INC(scriptRun
->pushCount
);
81 scriptRun
->fixupCount
= LIMIT_INC(scriptRun
->fixupCount
);
83 scriptRun
->parenSP
= INC1(scriptRun
->parenSP
);
84 scriptRun
->parenStack
[scriptRun
->parenSP
].pairIndex
= pairIndex
;
85 scriptRun
->parenStack
[scriptRun
->parenSP
].scriptCode
= scriptCode
;
88 static void pop(UScriptRun
*scriptRun
)
90 if (STACK_IS_EMPTY(scriptRun
)) {
94 if (scriptRun
->fixupCount
> 0) {
95 scriptRun
->fixupCount
-= 1;
98 scriptRun
->pushCount
-= 1;
99 scriptRun
->parenSP
= DEC1(scriptRun
->parenSP
);
101 /* If the stack is now empty, reset the stack
102 pointers to their initial values.
104 if (STACK_IS_EMPTY(scriptRun
)) {
105 scriptRun
->parenSP
= -1;
109 static void fixup(UScriptRun
*scriptRun
, UScriptCode scriptCode
)
111 int32_t fixupSP
= DEC(scriptRun
->parenSP
, scriptRun
->fixupCount
);
113 while (scriptRun
->fixupCount
-- > 0) {
114 fixupSP
= INC1(fixupSP
);
115 scriptRun
->parenStack
[fixupSP
].scriptCode
= scriptCode
;
/*
 * highBit: compares value against the thresholds 1 << 16, 1 << 8, 1 << 4,
 * 1 << 2 and 1 << 1, in that order. getPairIndex() uses the result as a
 * shift count (1 << highBit(n)); the forward declaration gives the return
 * type as int8_t.
 *
 * NOTE(review): the bodies of the threshold tests and the return statement
 * are not visible in this listing. Presumably each taken branch shifts value
 * down and accumulates the bit position, so the result is the index of the
 * highest set bit -- confirm against the full file, including what is
 * returned for value <= 0.
 */
120 highBit(int32_t value
)
128 if (value
>= 1 << 16) {
133 if (value
>= 1 << 8) {
138 if (value
>= 1 << 4) {
143 if (value
>= 1 << 2) {
148 if (value
>= 1 << 1) {
/*
 * getPairIndex: looks ch up in the pairedChars table. uscript_nextRun()
 * treats a non-negative result as "ch is a paired character", with an even
 * index meaning an opening character and an odd index a closing one.
 *
 * NOTE(review): the statements that update probe and pairIndex inside the
 * loop, the body of the final mismatch test and the return statement are not
 * visible in this listing. The shape matches a power-of-two probe search
 * over a sorted table -- confirm against the full file.
 */
157 getPairIndex(UChar32 ch
)
/* Split the table size into the largest power of two and the remainder. */
159 int32_t pairedCharCount
= UPRV_LENGTHOF(pairedChars
);
160 int32_t pairedCharPower
= 1 << highBit(pairedCharCount
);
161 int32_t pairedCharExtra
= pairedCharCount
- pairedCharPower
;
163 int32_t probe
= pairedCharPower
;
164 int32_t pairIndex
= 0;
/* Start the search at the remainder offset when ch is not below that entry. */
166 if (ch
>= pairedChars
[pairedCharExtra
]) {
167 pairIndex
= pairedCharExtra
;
/* Probe loop: runs while probe is greater than 1. */
170 while (probe
> (1 << 0)) {
173 if (ch
>= pairedChars
[pairIndex
+ probe
]) {
/* Final check: the entry reached must equal ch exactly to count as a match. */
178 if (pairedChars
[pairIndex
] != ch
) {
186 sameScript(UScriptCode scriptOne
, UScriptCode scriptTwo
)
188 return scriptOne
<= USCRIPT_INHERITED
|| scriptTwo
<= USCRIPT_INHERITED
|| scriptOne
== scriptTwo
;
191 U_CAPI UScriptRun
* U_EXPORT2
192 uscript_openRun(const UChar
*src
, int32_t length
, UErrorCode
*pErrorCode
)
194 UScriptRun
*result
= NULL
;
196 if (pErrorCode
== NULL
|| U_FAILURE(*pErrorCode
)) {
200 result
= (UScriptRun
*)uprv_malloc(sizeof (UScriptRun
));
202 if (result
== NULL
) {
203 *pErrorCode
= U_MEMORY_ALLOCATION_ERROR
;
207 uscript_setRunText(result
, src
, length
, pErrorCode
);
209 /* Release the UScriptRun if uscript_setRunText() returns an error */
210 if (U_FAILURE(*pErrorCode
)) {
218 U_CAPI
void U_EXPORT2
219 uscript_closeRun(UScriptRun
*scriptRun
)
221 if (scriptRun
!= NULL
) {
222 uprv_free(scriptRun
);
226 U_CAPI
void U_EXPORT2
227 uscript_resetRun(UScriptRun
*scriptRun
)
229 if (scriptRun
!= NULL
) {
230 scriptRun
->scriptStart
= 0;
231 scriptRun
->scriptLimit
= 0;
232 scriptRun
->scriptCode
= USCRIPT_INVALID_CODE
;
233 scriptRun
->parenSP
= -1;
234 scriptRun
->pushCount
= 0;
235 scriptRun
->fixupCount
= 0;
239 U_CAPI
void U_EXPORT2
240 uscript_setRunText(UScriptRun
*scriptRun
, const UChar
*src
, int32_t length
, UErrorCode
*pErrorCode
)
242 if (pErrorCode
== NULL
|| U_FAILURE(*pErrorCode
)) {
246 if (scriptRun
== NULL
|| length
< 0 || ((src
== NULL
) != (length
== 0))) {
247 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
251 scriptRun
->textArray
= src
;
252 scriptRun
->textLength
= length
;
254 uscript_resetRun(scriptRun
);
257 U_CAPI UBool U_EXPORT2
258 uscript_nextRun(UScriptRun
*scriptRun
, int32_t *pRunStart
, int32_t *pRunLimit
, UScriptCode
*pRunScript
)
260 UErrorCode error
= U_ZERO_ERROR
;
262 /* if we've fallen off the end of the text, we're done */
263 if (scriptRun
== NULL
|| scriptRun
->scriptLimit
>= scriptRun
->textLength
) {
267 SYNC_FIXUP(scriptRun
);
268 scriptRun
->scriptCode
= USCRIPT_COMMON
;
270 for (scriptRun
->scriptStart
= scriptRun
->scriptLimit
; scriptRun
->scriptLimit
< scriptRun
->textLength
; scriptRun
->scriptLimit
+= 1) {
271 UChar high
= scriptRun
->textArray
[scriptRun
->scriptLimit
];
277 * if the character is a high surrogate and it's not the last one
278 * in the text, see if it's followed by a low surrogate
280 if (high
>= 0xD800 && high
<= 0xDBFF && scriptRun
->scriptLimit
< scriptRun
->textLength
- 1) {
281 UChar low
= scriptRun
->textArray
[scriptRun
->scriptLimit
+ 1];
284 * if it is followed by a low surrogate,
285 * consume it and form the full character
287 if (low
>= 0xDC00 && low
<= 0xDFFF) {
288 ch
= (high
- 0xD800) * 0x0400 + low
- 0xDC00 + 0x10000;
289 scriptRun
->scriptLimit
+= 1;
293 sc
= uscript_getScript(ch
, &error
);
294 pairIndex
= getPairIndex(ch
);
297 * Paired character handling:
299 * if it's an open character, push it onto the stack.
300 * if it's a close character, find the matching open on the
301 * stack, and use that script code. Any non-matching open
302 * characters above it on the stack will be popped.
304 if (pairIndex
>= 0) {
305 if ((pairIndex
& 1) == 0) {
306 push(scriptRun
, pairIndex
, scriptRun
->scriptCode
);
308 int32_t pi
= pairIndex
& ~1;
310 while (STACK_IS_NOT_EMPTY(scriptRun
) && TOP(scriptRun
).pairIndex
!= pi
) {
314 if (STACK_IS_NOT_EMPTY(scriptRun
)) {
315 sc
= TOP(scriptRun
).scriptCode
;
320 if (sameScript(scriptRun
->scriptCode
, sc
)) {
321 if (scriptRun
->scriptCode
<= USCRIPT_INHERITED
&& sc
> USCRIPT_INHERITED
) {
322 scriptRun
->scriptCode
= sc
;
324 fixup(scriptRun
, scriptRun
->scriptCode
);
328 * if this character is a close paired character,
329 * pop the matching open character from the stack
331 if (pairIndex
>= 0 && (pairIndex
& 1) != 0) {
336 * if the run broke on a surrogate pair,
337 * end it before the high surrogate
340 scriptRun
->scriptLimit
-= 1;
348 if (pRunStart
!= NULL
) {
349 *pRunStart
= scriptRun
->scriptStart
;
352 if (pRunLimit
!= NULL
) {
353 *pRunLimit
= scriptRun
->scriptLimit
;
356 if (pRunScript
!= NULL
) {
357 *pRunScript
= scriptRun
->scriptCode
;