]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/usc_impl.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / usc_impl.c
1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2002, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
7 * File USC_IMPL.C
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 07/08/2002 Eric Mader Creation.
13 ******************************************************************************
14 */
15
16 #include "unicode/uscript.h"
17 #include "usc_impl.h"
18 #include "cmemory.h"
19
/*
 * Number of elements in a true array (not a pointer). The argument is
 * fully parenthesized so that any array-valued expression can be passed
 * without operator-precedence surprises.
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Maximum number of entries held on a UScriptRun's paired-punctuation stack. */
#define PAREN_STACK_DEPTH 128
23
24 struct ParenStackEntry
25 {
26 int32_t pairIndex;
27 UScriptCode scriptCode;
28 };
29
30 struct UScriptRun
31 {
32 int32_t textLength;
33 const UChar *textArray;
34
35 int32_t scriptStart;
36 int32_t scriptLimit;
37 UScriptCode scriptCode;
38
39 struct ParenStackEntry parenStack[PAREN_STACK_DEPTH];
40 int32_t parenSP;
41 };
42
43 static int8_t highBit(int32_t value);
44
45 static const UChar32 pairedChars[] = {
46 0x0028, 0x0029, /* ascii paired punctuation */
47 0x003c, 0x003e,
48 0x005b, 0x005d,
49 0x007b, 0x007d,
50 0x00ab, 0x00bb, /* guillemets */
51 0x2018, 0x2019, /* general punctuation */
52 0x201c, 0x201d,
53 0x2039, 0x203a,
54 0x3008, 0x3009, /* chinese paired punctuation */
55 0x300a, 0x300b,
56 0x300c, 0x300d,
57 0x300e, 0x300f,
58 0x3010, 0x3011,
59 0x3014, 0x3015,
60 0x3016, 0x3017,
61 0x3018, 0x3019,
62 0x301a, 0x301b
63 };
64
/*
 * Return the zero-based position of the most significant set bit of
 * value, or -32 when value is zero or negative.
 */
static int8_t
highBit(int32_t value)
{
    /* successive halving: test the upper 16, 8, 4, 2 and 1 bits in turn */
    static const int8_t shifts[] = { 16, 8, 4, 2, 1 };
    int8_t position = 0;
    int32_t i;

    if (value <= 0) {
        return -32;
    }

    for (i = 0; i < (int32_t) (sizeof shifts / sizeof shifts[0]); i += 1) {
        int32_t upper = value >> shifts[i];

        /* something is set at or above this shift: keep only that part */
        if (upper != 0) {
            value = upper;
            position = (int8_t) (position + shifts[i]);
        }
    }

    return position;
}
101
102 static int32_t
103 getPairIndex(UChar32 ch)
104 {
105 int32_t pairedCharCount = ARRAY_SIZE(pairedChars);
106 int32_t pairedCharPower = 1 << highBit(pairedCharCount);
107 int32_t pairedCharExtra = pairedCharCount - pairedCharPower;
108
109 int32_t probe = pairedCharPower;
110 int32_t index = 0;
111
112 if (ch >= pairedChars[pairedCharExtra]) {
113 index = pairedCharExtra;
114 }
115
116 while (probe > (1 << 0)) {
117 probe >>= 1;
118
119 if (ch >= pairedChars[index + probe]) {
120 index += probe;
121 }
122 }
123
124 if (pairedChars[index] != ch) {
125 index = -1;
126 }
127
128 return index;
129 }
130
131 static UBool
132 sameScript(UScriptCode scriptOne, UScriptCode scriptTwo)
133 {
134 return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
135 }
136
137 U_CAPI UScriptRun * U_EXPORT2
138 uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode)
139 {
140 UScriptRun *result = NULL;
141
142 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
143 return NULL;
144 }
145
146 result = uprv_malloc(sizeof (UScriptRun));
147
148 if (result == NULL) {
149 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
150 return NULL;
151 }
152
153 uscript_setRunText(result, src, length, pErrorCode);
154
155 /* Release the UScriptRun if uscript_setRunText() returns an error */
156 if (U_FAILURE(*pErrorCode)) {
157 uprv_free(result);
158 result = NULL;
159 }
160
161 return result;
162 }
163
164 U_CAPI void U_EXPORT2
165 uscript_closeRun(UScriptRun *scriptRun)
166 {
167 if (scriptRun != NULL) {
168 uprv_free(scriptRun);
169 }
170 }
171
172 U_CAPI void U_EXPORT2
173 uscript_resetRun(UScriptRun *scriptRun)
174 {
175 if (scriptRun != NULL) {
176 scriptRun->scriptStart = 0;
177 scriptRun->scriptLimit = 0;
178 scriptRun->scriptCode = USCRIPT_INVALID_CODE;
179 scriptRun->parenSP = -1;
180 }
181 }
182
183 U_CAPI void U_EXPORT2
184 uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode)
185 {
186 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
187 return;
188 }
189
190 if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) {
191 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
192 return;
193 }
194
195 scriptRun->textArray = src;
196 scriptRun->textLength = length;
197
198 uscript_resetRun(scriptRun);
199 }
200
201 U_CAPI UBool U_EXPORT2
202 uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript)
203 {
204 int32_t startSP = -1; /* used to find the first new open character */
205 UErrorCode error = U_ZERO_ERROR;
206
207 /* if we've fallen off the end of the text, we're done */
208 if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) {
209 return FALSE;
210 }
211
212 startSP = scriptRun->parenSP;
213 scriptRun->scriptCode = USCRIPT_COMMON;
214
215 for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) {
216 UChar high = scriptRun->textArray[scriptRun->scriptLimit];
217 UChar32 ch = high;
218 UScriptCode sc;
219 int32_t pairIndex;
220
221 /*
222 * if the character is a high surrogate and it's not the last one
223 * in the text, see if it's followed by a low surrogate
224 */
225 if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) {
226 UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1];
227
228 /*
229 * if it is followed by a low surrogate,
230 * consume it and form the full character
231 */
232 if (low >= 0xDC00 && low <= 0xDFFF) {
233 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
234 scriptRun->scriptLimit += 1;
235 }
236 }
237
238 sc = uscript_getScript(ch, &error);
239 pairIndex = getPairIndex(ch);
240
241 /*
242 * Paired character handling:
243 *
244 * if it's an open character, push it onto the stack.
245 * if it's a close character, find the matching open on the
246 * stack, and use that script code. Any non-matching open
247 * characters above it on the stack will be poped.
248 */
249 if (pairIndex >= 0) {
250 if ((pairIndex & 1) == 0) {
251
252 /*
253 * If the paren stack is full, empty it. This
254 * means that deeply nested paired punctuation
255 * characters will be ignored, but that's an unusual
256 * case, and it's better to ignore them than to
257 * write off the end of the stack...
258 */
259 if (++scriptRun->parenSP >= PAREN_STACK_DEPTH) {
260 scriptRun->parenSP = 0;
261 }
262
263 scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex;
264 scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptRun->scriptCode;
265 } else if (scriptRun->parenSP >= 0) {
266 int32_t pi = pairIndex & ~1;
267
268 while (scriptRun->parenSP >= 0 && scriptRun->parenStack[scriptRun->parenSP].pairIndex != pi) {
269 scriptRun->parenSP -= 1;
270 }
271
272 if (scriptRun->parenSP < startSP) {
273 startSP = scriptRun->parenSP;
274 }
275
276 if (scriptRun->parenSP >= 0) {
277 sc = scriptRun->parenStack[scriptRun->parenSP].scriptCode;
278 }
279 }
280 }
281
282 if (sameScript(scriptRun->scriptCode, sc)) {
283 if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
284 scriptRun->scriptCode = sc;
285
286 /*
287 * now that we have a final script code, fix any open
288 * characters we pushed before we knew the script code.
289 */
290 while (startSP < scriptRun->parenSP) {
291 scriptRun->parenStack[++startSP].scriptCode = scriptRun->scriptCode;
292 }
293 }
294
295 /*
296 * if this character is a close paired character,
297 * pop it from the stack
298 */
299 if (pairIndex >= 0 && (pairIndex & 1) != 0 && scriptRun->parenSP >= 0) {
300 scriptRun->parenSP -= 1;
301 startSP -= 1;
302 }
303 } else {
304 /*
305 * if the run broke on a surrogate pair,
306 * end it before the high surrogate
307 */
308 if (ch >= 0x10000) {
309 scriptRun->scriptLimit -= 1;
310 }
311
312 break;
313 }
314 }
315
316
317 if (pRunStart != NULL) {
318 *pRunStart = scriptRun->scriptStart;
319 }
320
321 if (pRunLimit != NULL) {
322 *pRunLimit = scriptRun->scriptLimit;
323 }
324
325 if (pRunScript != NULL) {
326 *pRunScript = scriptRun->scriptCode;
327 }
328
329 return TRUE;
330 }