2 **********************************************************************
3 * Copyright (C) 1999-2002, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
9 * Modification History:
11 * Date Name Description
12 * 07/08/2002 Eric Mader Creation.
13 ******************************************************************************
16 #include "unicode/uscript.h"
/*
 * Element count of an array: total byte size divided by the size of one
 * element. NOTE(review): the macro argument is not parenthesized, so it is
 * only safe with a plain array name (it is applied to pairedChars below).
 */
20 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
/* Number of entries in the parenStack array declared below. */
22 #define PAREN_STACK_DEPTH 128
/*
 * One entry of the paired-punctuation stack. The scriptCode member holds the
 * run script that uscript_nextRun() stores when it pushes an open character.
 * NOTE(review): the rest of this declaration is not visible in this listing;
 * uscript_nextRun() also accesses a pairIndex member of each entry.
 */
24 struct ParenStackEntry
27 UScriptCode scriptCode
;
/*
 * Iterator state members. NOTE(review): the enclosing declaration is not
 * visible in this listing; the functions below reach these members through a
 * UScriptRun pointer, together with textLength, scriptStart, scriptLimit and
 * parenSP, which are not shown here.
 */
/* Text being scanned; assigned from the src argument of uscript_setRunText(). */
33 const UChar
*textArray
;
/* Script of the current run; reported through pRunScript by uscript_nextRun(). */
37 UScriptCode scriptCode
;
/* Stack of open paired characters, indexed by parenSP. */
39 struct ParenStackEntry parenStack
[PAREN_STACK_DEPTH
];
/* Forward declaration of highBit(), defined further down and called by getPairIndex(). */
43 static int8_t highBit(int32_t value
);
/*
 * Table of paired punctuation code points. Entries come in (open, close)
 * couples, so an even index is an opening character and the following odd
 * index is its closing partner; uscript_nextRun() relies on that parity.
 * getPairIndex() probes the table with >= comparisons, which presumably
 * requires ascending order - TODO confirm for the entries not shown here.
 */
45 static const UChar32 pairedChars
[] = {
46 0x0028, 0x0029, /* ascii paired punctuation */
50 0x00ab, 0x00bb, /* guillemets */
51 0x2018, 0x2019, /* general punctuation */
54 0x3008, 0x3009, /* chinese paired punctuation */
/*
 * highBit: the visible tests compare value against 1 << 16, 1 << 8, 1 << 4,
 * 1 << 2 and 1 << 1 in turn, i.e. with halving shift widths. getPairIndex()
 * uses the result as a shift count (1 << highBit(n)).
 * NOTE(review): the statements inside each branch and the return statement
 * are not visible in this listing - presumably the function yields the index
 * of the highest set bit of value; confirm against the full source.
 */
66 highBit(int32_t value
)
74 if (value
>= 1 << 16) {
79 if (value
>= 1 << 8) {
84 if (value
>= 1 << 4) {
89 if (value
>= 1 << 2) {
94 if (value
>= 1 << 1) {
/*
 * getPairIndex: look ch up in the pairedChars table.
 *
 * pairedCharPower is 1 << highBit(pairedCharCount), and pairedCharExtra is
 * the number of table entries beyond that power. The first test selects the
 * starting index (pairedCharExtra when ch is at or above that entry); the
 * loop then compares ch with pairedChars[index + probe] while probe is
 * greater than 1; the final test checks whether the entry at index is ch.
 *
 * NOTE(review): the updates of probe and index and the return statements are
 * not visible in this listing. uscript_nextRun() treats a result >= 0 as a
 * table index, so a miss presumably returns a negative value - confirm.
 */
103 getPairIndex(UChar32 ch
)
/* Table geometry used to set up the probe sequence. */
105 int32_t pairedCharCount
= ARRAY_SIZE(pairedChars
);
106 int32_t pairedCharPower
= 1 << highBit(pairedCharCount
);
107 int32_t pairedCharExtra
= pairedCharCount
- pairedCharPower
;
109 int32_t probe
= pairedCharPower
;
/* Choose the starting offset so that the remaining span is pairedCharPower wide. */
112 if (ch
>= pairedChars
[pairedCharExtra
]) {
113 index
= pairedCharExtra
;
116 while (probe
> (1 << 0)) {
119 if (ch
>= pairedChars
[index
+ probe
]) {
/* The probing only narrows the position; an exact comparison decides the match. */
124 if (pairedChars
[index
] != ch
) {
/*
 * sameScript: decide whether two script codes may belong to the same run.
 * The result is true when either code is <= USCRIPT_INHERITED (such codes
 * match any script) or when both codes are equal.
 */
132 sameScript(UScriptCode scriptOne
, UScriptCode scriptTwo
)
134 return scriptOne
<= USCRIPT_INHERITED
|| scriptTwo
<= USCRIPT_INHERITED
|| scriptOne
== scriptTwo
;
/*
 * uscript_openRun: allocate a UScriptRun with uprv_malloc() and attach the
 * given text to it through uscript_setRunText().
 *
 * src         text to iterate over, passed on to uscript_setRunText()
 * length      number of UChars in src, passed on to uscript_setRunText()
 * pErrorCode  in/out error code; a NULL pointer or an incoming failure takes
 *             the early branch, a failed allocation stores
 *             U_MEMORY_ALLOCATION_ERROR
 *
 * NOTE(review): the return statements and the body of the final failure
 * branch are not visible in this listing; per the existing comment the
 * object is released when uscript_setRunText() reports an error.
 */
137 U_CAPI UScriptRun
* U_EXPORT2
138 uscript_openRun(const UChar
*src
, int32_t length
, UErrorCode
*pErrorCode
)
140 UScriptRun
*result
= NULL
;
142 if (pErrorCode
== NULL
|| U_FAILURE(*pErrorCode
)) {
146 result
= uprv_malloc(sizeof (UScriptRun
));
148 if (result
== NULL
) {
149 *pErrorCode
= U_MEMORY_ALLOCATION_ERROR
;
/* Argument validation is delegated to uscript_setRunText(). */
153 uscript_setRunText(result
, src
, length
, pErrorCode
);
155 /* Release the UScriptRun if uscript_setRunText() returns an error */
156 if (U_FAILURE(*pErrorCode
)) {
/*
 * uscript_closeRun: release a UScriptRun with uprv_free().
 * A NULL scriptRun is accepted and skips the free. The visible code frees
 * only the UScriptRun object itself, not the text it points to.
 */
164 U_CAPI
void U_EXPORT2
165 uscript_closeRun(UScriptRun
*scriptRun
)
167 if (scriptRun
!= NULL
) {
168 uprv_free(scriptRun
);
/*
 * uscript_resetRun: rewind the iterator to the start of its text.
 * For a non-NULL scriptRun this sets scriptStart and scriptLimit to 0,
 * scriptCode to USCRIPT_INVALID_CODE and parenSP to -1 (empty paren stack).
 * A NULL scriptRun is ignored.
 */
172 U_CAPI
void U_EXPORT2
173 uscript_resetRun(UScriptRun
*scriptRun
)
175 if (scriptRun
!= NULL
) {
176 scriptRun
->scriptStart
= 0;
177 scriptRun
->scriptLimit
= 0;
178 scriptRun
->scriptCode
= USCRIPT_INVALID_CODE
;
/* -1 marks the paren stack as empty; pushes pre-increment parenSP. */
179 scriptRun
->parenSP
= -1;
/*
 * uscript_setRunText: attach a new text to an existing iterator and reset it.
 *
 * scriptRun   iterator to update; NULL is rejected
 * src         text to scan; the pointer is stored, the text is not copied
 * length      number of UChars in src; negative values are rejected
 * pErrorCode  in/out error code; a NULL pointer or an incoming failure takes
 *             the early branch
 *
 * U_ILLEGAL_ARGUMENT_ERROR is stored when scriptRun is NULL, length is
 * negative, or src and length disagree: the test requires src == NULL
 * exactly when length == 0, so a non-NULL src with length 0 is rejected too.
 */
183 U_CAPI
void U_EXPORT2
184 uscript_setRunText(UScriptRun
*scriptRun
, const UChar
*src
, int32_t length
, UErrorCode
*pErrorCode
)
186 if (pErrorCode
== NULL
|| U_FAILURE(*pErrorCode
)) {
190 if (scriptRun
== NULL
|| length
< 0 || ((src
== NULL
) != (length
== 0))) {
191 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
195 scriptRun
->textArray
= src
;
196 scriptRun
->textLength
= length
;
/* Start iteration from offset 0 with an empty paren stack. */
198 uscript_resetRun(scriptRun
);
/*
 * uscript_nextRun: advance the iterator to the next script run.
 *
 * scriptRun   iterator state; a NULL pointer, or a scriptLimit already at or
 *             past textLength, takes the early end-of-text branch
 * pRunStart   if not NULL, receives scriptRun->scriptStart
 * pRunLimit   if not NULL, receives scriptRun->scriptLimit
 * pRunScript  if not NULL, receives scriptRun->scriptCode
 *
 * The run starts at the previous scriptLimit with script USCRIPT_COMMON.
 * Each step reads one UChar (combining a high and low surrogate into one
 * code point when both are present), gets its script from
 * uscript_getScript(), and applies the paired-punctuation stack so that a
 * closing character takes the script recorded for its opening partner. The
 * run continues while sameScript() accepts the character.
 *
 * NOTE(review): the UBool return statements and the branch that ends the run
 * are not visible in this listing. In the visible lines the local error code
 * filled by uscript_getScript() is never inspected.
 */
201 U_CAPI UBool U_EXPORT2
202 uscript_nextRun(UScriptRun
*scriptRun
, int32_t *pRunStart
, int32_t *pRunLimit
, UScriptCode
*pRunScript
)
204 int32_t startSP
= -1; /* used to find the first new open character */
205 UErrorCode error
= U_ZERO_ERROR
;
207 /* if we've fallen off the end of the text, we're done */
208 if (scriptRun
== NULL
|| scriptRun
->scriptLimit
>= scriptRun
->textLength
) {
/* Remember the stack depth at entry; entries above it are pushed during this run. */
212 startSP
= scriptRun
->parenSP
;
213 scriptRun
->scriptCode
= USCRIPT_COMMON
;
215 for (scriptRun
->scriptStart
= scriptRun
->scriptLimit
; scriptRun
->scriptLimit
< scriptRun
->textLength
; scriptRun
->scriptLimit
+= 1) {
216 UChar high
= scriptRun
->textArray
[scriptRun
->scriptLimit
];
222 * if the character is a high surrogate and it's not the last one
223 * in the text, see if it's followed by a low surrogate
225 if (high
>= 0xD800 && high
<= 0xDBFF && scriptRun
->scriptLimit
< scriptRun
->textLength
- 1) {
226 UChar low
= scriptRun
->textArray
[scriptRun
->scriptLimit
+ 1];
229 * if it is followed by a low surrogate,
230 * consume it and form the full character
232 if (low
>= 0xDC00 && low
<= 0xDFFF) {
233 ch
= (high
- 0xD800) * 0x0400 + low
- 0xDC00 + 0x10000;
234 scriptRun
->scriptLimit
+= 1;
/* Classify the code point: its script, and its slot in pairedChars (negative test below means no pair). */
238 sc
= uscript_getScript(ch
, &error
);
239 pairIndex
= getPairIndex(ch
);
242 * Paired character handling:
244 * if it's an open character, push it onto the stack.
245 * if it's a close character, find the matching open on the
246 * stack, and use that script code. Any non-matching open
247 * characters above it on the stack will be poped.
249 if (pairIndex
>= 0) {
/* Even table index: opening character. Odd table index: closing character. */
250 if ((pairIndex
& 1) == 0) {
253 * If the paren stack is full, empty it. This
254 * means that deeply nested paired punctuation
255 * characters will be ignored, but that's an unusual
256 * case, and it's better to ignore them than to
257 * write off the end of the stack...
259 if (++scriptRun
->parenSP
>= PAREN_STACK_DEPTH
) {
260 scriptRun
->parenSP
= 0;
263 scriptRun
->parenStack
[scriptRun
->parenSP
].pairIndex
= pairIndex
;
264 scriptRun
->parenStack
[scriptRun
->parenSP
].scriptCode
= scriptRun
->scriptCode
;
265 } else if (scriptRun
->parenSP
>= 0) {
/* Clearing the low bit turns the index of a close character into the index of its open partner. */
266 int32_t pi
= pairIndex
& ~1;
268 while (scriptRun
->parenSP
>= 0 && scriptRun
->parenStack
[scriptRun
->parenSP
].pairIndex
!= pi
) {
269 scriptRun
->parenSP
-= 1;
/* Popping below the entry depth means none of the remaining entries were pushed in this run. */
272 if (scriptRun
->parenSP
< startSP
) {
273 startSP
= scriptRun
->parenSP
;
276 if (scriptRun
->parenSP
>= 0) {
277 sc
= scriptRun
->parenStack
[scriptRun
->parenSP
].scriptCode
;
282 if (sameScript(scriptRun
->scriptCode
, sc
)) {
/* The first code above USCRIPT_INHERITED fixes the script of the run. */
283 if (scriptRun
->scriptCode
<= USCRIPT_INHERITED
&& sc
> USCRIPT_INHERITED
) {
284 scriptRun
->scriptCode
= sc
;
287 * now that we have a final script code, fix any open
288 * characters we pushed before we knew the script code.
290 while (startSP
< scriptRun
->parenSP
) {
291 scriptRun
->parenStack
[++startSP
].scriptCode
= scriptRun
->scriptCode
;
296 * if this character is a close paired character,
297 * pop it from the stack
299 if (pairIndex
>= 0 && (pairIndex
& 1) != 0 && scriptRun
->parenSP
>= 0) {
300 scriptRun
->parenSP
-= 1;
305 * if the run broke on a surrogate pair,
306 * end it before the high surrogate
309 scriptRun
->scriptLimit
-= 1;
/* Report the run through whichever output pointers the caller supplied. */
317 if (pRunStart
!= NULL
) {
318 *pRunStart
= scriptRun
->scriptStart
;
321 if (pRunLimit
!= NULL
) {
322 *pRunLimit
= scriptRun
->scriptLimit
;
325 if (pRunScript
!= NULL
) {
326 *pRunScript
= scriptRun
->scriptCode
;