]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/usc_impl.c
ICU-57165.0.1.tar.gz
[apple/icu.git] / icuSources / common / usc_impl.c
1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
7 * File USC_IMPL.C
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 07/08/2002 Eric Mader Creation.
13 ******************************************************************************
14 */
15
16 #include "unicode/uscript.h"
17 #include "usc_impl.h"
18 #include "cmemory.h"
19
/* Number of slots in the circular stack of open paired characters. */
#define PAREN_STACK_DEPTH 32

/* Reduce a stack index modulo the stack depth. */
#define MOD(sp) ((sp) % PAREN_STACK_DEPTH)
/* Add one to a counter, saturating at PAREN_STACK_DEPTH. */
#define LIMIT_INC(sp) (((sp) >= PAREN_STACK_DEPTH)? PAREN_STACK_DEPTH : (sp) + 1)
/* Advance a stack index by count slots, wrapping around the end. */
#define INC(sp,count) (((sp) + (count)) % PAREN_STACK_DEPTH)
#define INC1(sp) (INC((sp), 1))
/* Move a stack index back by count slots, wrapping around the start. */
#define DEC(sp,count) (((sp) + PAREN_STACK_DEPTH - (count)) % PAREN_STACK_DEPTH)
#define DEC1(sp) (DEC((sp), 1))
/* The stack holds live entries only while pushCount is positive. */
#define STACK_IS_NOT_EMPTY(scriptRun) ((scriptRun)->pushCount > 0)
#define STACK_IS_EMPTY(scriptRun) (! STACK_IS_NOT_EMPTY(scriptRun))
/* The most recently pushed entry; only meaningful when the stack is not empty. */
#define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP])
/* Forget which entries are still waiting for a script fixup. */
#define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0)
32
33 struct ParenStackEntry
34 {
35 int32_t pairIndex;
36 UScriptCode scriptCode;
37 };
38
39 struct UScriptRun
40 {
41 int32_t textLength;
42 const UChar *textArray;
43
44 int32_t scriptStart;
45 int32_t scriptLimit;
46 UScriptCode scriptCode;
47
48 struct ParenStackEntry parenStack[PAREN_STACK_DEPTH];
49 int32_t parenSP;
50 int32_t pushCount;
51 int32_t fixupCount;
52 };
53
54 static int8_t highBit(int32_t value);
55
56 static const UChar32 pairedChars[] = {
57 0x0028, 0x0029, /* ascii paired punctuation */
58 0x003c, 0x003e,
59 0x005b, 0x005d,
60 0x007b, 0x007d,
61 0x00ab, 0x00bb, /* guillemets */
62 0x2018, 0x2019, /* general punctuation */
63 0x201c, 0x201d,
64 0x2039, 0x203a,
65 0x3008, 0x3009, /* chinese paired punctuation */
66 0x300a, 0x300b,
67 0x300c, 0x300d,
68 0x300e, 0x300f,
69 0x3010, 0x3011,
70 0x3014, 0x3015,
71 0x3016, 0x3017,
72 0x3018, 0x3019,
73 0x301a, 0x301b
74 };
75
76 static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode)
77 {
78 scriptRun->pushCount = LIMIT_INC(scriptRun->pushCount);
79 scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount);
80
81 scriptRun->parenSP = INC1(scriptRun->parenSP);
82 scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex;
83 scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptCode;
84 }
85
86 static void pop(UScriptRun *scriptRun)
87 {
88 if (STACK_IS_EMPTY(scriptRun)) {
89 return;
90 }
91
92 if (scriptRun->fixupCount > 0) {
93 scriptRun->fixupCount -= 1;
94 }
95
96 scriptRun->pushCount -= 1;
97 scriptRun->parenSP = DEC1(scriptRun->parenSP);
98
99 /* If the stack is now empty, reset the stack
100 pointers to their initial values.
101 */
102 if (STACK_IS_EMPTY(scriptRun)) {
103 scriptRun->parenSP = -1;
104 }
105 }
106
107 static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode)
108 {
109 int32_t fixupSP = DEC(scriptRun->parenSP, scriptRun->fixupCount);
110
111 while (scriptRun->fixupCount-- > 0) {
112 fixupSP = INC1(fixupSP);
113 scriptRun->parenStack[fixupSP].scriptCode = scriptCode;
114 }
115 }
116
/*
 * Return the zero-based position of the highest set bit in value,
 * i.e. floor(log2(value)), or -32 when value is zero or negative.
 */
static int8_t
highBit(int32_t value)
{
    int8_t position = 0;
    int32_t shift;

    if (value <= 0) {
        return -32;
    }

    /*
     * Binary search over the bit positions: whenever anything is set
     * at or above the probed bit, discard the low bits and credit
     * their width to the result.
     */
    for (shift = 16; shift > 0; shift >>= 1) {
        if ((value >> shift) != 0) {
            value >>= shift;
            position = (int8_t)(position + shift);
        }
    }

    return position;
}
153
154 static int32_t
155 getPairIndex(UChar32 ch)
156 {
157 int32_t pairedCharCount = UPRV_LENGTHOF(pairedChars);
158 int32_t pairedCharPower = 1 << highBit(pairedCharCount);
159 int32_t pairedCharExtra = pairedCharCount - pairedCharPower;
160
161 int32_t probe = pairedCharPower;
162 int32_t pairIndex = 0;
163
164 if (ch >= pairedChars[pairedCharExtra]) {
165 pairIndex = pairedCharExtra;
166 }
167
168 while (probe > (1 << 0)) {
169 probe >>= 1;
170
171 if (ch >= pairedChars[pairIndex + probe]) {
172 pairIndex += probe;
173 }
174 }
175
176 if (pairedChars[pairIndex] != ch) {
177 pairIndex = -1;
178 }
179
180 return pairIndex;
181 }
182
183 static UBool
184 sameScript(UScriptCode scriptOne, UScriptCode scriptTwo)
185 {
186 return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
187 }
188
189 U_CAPI UScriptRun * U_EXPORT2
190 uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode)
191 {
192 UScriptRun *result = NULL;
193
194 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
195 return NULL;
196 }
197
198 result = uprv_malloc(sizeof (UScriptRun));
199
200 if (result == NULL) {
201 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
202 return NULL;
203 }
204
205 uscript_setRunText(result, src, length, pErrorCode);
206
207 /* Release the UScriptRun if uscript_setRunText() returns an error */
208 if (U_FAILURE(*pErrorCode)) {
209 uprv_free(result);
210 result = NULL;
211 }
212
213 return result;
214 }
215
216 U_CAPI void U_EXPORT2
217 uscript_closeRun(UScriptRun *scriptRun)
218 {
219 if (scriptRun != NULL) {
220 uprv_free(scriptRun);
221 }
222 }
223
224 U_CAPI void U_EXPORT2
225 uscript_resetRun(UScriptRun *scriptRun)
226 {
227 if (scriptRun != NULL) {
228 scriptRun->scriptStart = 0;
229 scriptRun->scriptLimit = 0;
230 scriptRun->scriptCode = USCRIPT_INVALID_CODE;
231 scriptRun->parenSP = -1;
232 scriptRun->pushCount = 0;
233 scriptRun->fixupCount = 0;
234 }
235 }
236
237 U_CAPI void U_EXPORT2
238 uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode)
239 {
240 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
241 return;
242 }
243
244 if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) {
245 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
246 return;
247 }
248
249 scriptRun->textArray = src;
250 scriptRun->textLength = length;
251
252 uscript_resetRun(scriptRun);
253 }
254
255 U_CAPI UBool U_EXPORT2
256 uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript)
257 {
258 UErrorCode error = U_ZERO_ERROR;
259
260 /* if we've fallen off the end of the text, we're done */
261 if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) {
262 return FALSE;
263 }
264
265 SYNC_FIXUP(scriptRun);
266 scriptRun->scriptCode = USCRIPT_COMMON;
267
268 for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) {
269 UChar high = scriptRun->textArray[scriptRun->scriptLimit];
270 UChar32 ch = high;
271 UScriptCode sc;
272 int32_t pairIndex;
273
274 /*
275 * if the character is a high surrogate and it's not the last one
276 * in the text, see if it's followed by a low surrogate
277 */
278 if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) {
279 UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1];
280
281 /*
282 * if it is followed by a low surrogate,
283 * consume it and form the full character
284 */
285 if (low >= 0xDC00 && low <= 0xDFFF) {
286 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
287 scriptRun->scriptLimit += 1;
288 }
289 }
290
291 sc = uscript_getScript(ch, &error);
292 pairIndex = getPairIndex(ch);
293
294 /*
295 * Paired character handling:
296 *
297 * if it's an open character, push it onto the stack.
298 * if it's a close character, find the matching open on the
299 * stack, and use that script code. Any non-matching open
300 * characters above it on the stack will be poped.
301 */
302 if (pairIndex >= 0) {
303 if ((pairIndex & 1) == 0) {
304 push(scriptRun, pairIndex, scriptRun->scriptCode);
305 } else {
306 int32_t pi = pairIndex & ~1;
307
308 while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) {
309 pop(scriptRun);
310 }
311
312 if (STACK_IS_NOT_EMPTY(scriptRun)) {
313 sc = TOP(scriptRun).scriptCode;
314 }
315 }
316 }
317
318 if (sameScript(scriptRun->scriptCode, sc)) {
319 if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
320 scriptRun->scriptCode = sc;
321
322 fixup(scriptRun, scriptRun->scriptCode);
323 }
324
325 /*
326 * if this character is a close paired character,
327 * pop the matching open character from the stack
328 */
329 if (pairIndex >= 0 && (pairIndex & 1) != 0) {
330 pop(scriptRun);
331 }
332 } else {
333 /*
334 * if the run broke on a surrogate pair,
335 * end it before the high surrogate
336 */
337 if (ch >= 0x10000) {
338 scriptRun->scriptLimit -= 1;
339 }
340
341 break;
342 }
343 }
344
345
346 if (pRunStart != NULL) {
347 *pRunStart = scriptRun->scriptStart;
348 }
349
350 if (pRunLimit != NULL) {
351 *pRunLimit = scriptRun->scriptLimit;
352 }
353
354 if (pRunScript != NULL) {
355 *pRunScript = scriptRun->scriptCode;
356 }
357
358 return TRUE;
359 }