]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
729e4ab9 | 4 | * Copyright (C) 2001-2010, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: ucol_tok.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created 02/22/2001 | |
14 | * created by: Vladimir Weinstein | |
15 | * | |
374ca955 | 16 | * This module reads a tailoring rule string and produces a list of |
b75a7d8f | 17 | * tokens that will be turned into collation elements |
374ca955 | 18 | * |
b75a7d8f A |
19 | */ |
20 | ||
21 | #include "unicode/utypes.h" | |
22 | ||
23 | #if !UCONFIG_NO_COLLATION | |
24 | ||
729e4ab9 | 25 | #include "unicode/uscript.h" |
b75a7d8f A |
26 | #include "unicode/ustring.h" |
27 | #include "unicode/uchar.h" | |
28 | #include "unicode/uniset.h" | |
374ca955 | 29 | |
b75a7d8f | 30 | #include "cmemory.h" |
729e4ab9 A |
31 | #include "cstring.h" |
32 | #include "ucol_bld.h" | |
33 | #include "ucol_tok.h" | |
34 | #include "ulocimp.h" | |
35 | #include "uresimp.h" | |
374ca955 | 36 | #include "util.h" |
b75a7d8f | 37 | |
729e4ab9 A |
38 | // Define this only for debugging. |
39 | // #define DEBUG_FOR_COLL_RULES 1 | |
40 | ||
41 | #ifdef DEBUG_FOR_COLL_RULES | |
42 | #include <iostream> | |
43 | #endif | |
44 | ||
45 | U_NAMESPACE_USE | |
46 | ||
b75a7d8f | 47 | U_CDECL_BEGIN |
73c04bcf | 48 | static int32_t U_CALLCONV |
b75a7d8f A |
49 | uhash_hashTokens(const UHashTok k) |
50 | { | |
51 | int32_t hash = 0; | |
52 | //uint32_t key = (uint32_t)k.integer; | |
53 | UColToken *key = (UColToken *)k.pointer; | |
54 | if (key != 0) { | |
b75a7d8f A |
55 | int32_t len = (key->source & 0xFF000000)>>24; |
56 | int32_t inc = ((len - 32) / 32) + 1; | |
374ca955 | 57 | |
729e4ab9 | 58 | const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); |
374ca955 | 59 | const UChar *limit = p + len; |
b75a7d8f A |
60 | |
61 | while (p<limit) { | |
62 | hash = (hash * 37) + *p; | |
63 | p += inc; | |
64 | } | |
65 | } | |
66 | return hash; | |
67 | } | |
68 | ||
73c04bcf | 69 | static UBool U_CALLCONV |
b75a7d8f A |
70 | uhash_compareTokens(const UHashTok key1, const UHashTok key2) |
71 | { | |
72 | //uint32_t p1 = (uint32_t) key1.integer; | |
73 | //uint32_t p2 = (uint32_t) key2.integer; | |
74 | UColToken *p1 = (UColToken *)key1.pointer; | |
75 | UColToken *p2 = (UColToken *)key2.pointer; | |
729e4ab9 A |
76 | const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl); |
77 | const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); | |
b75a7d8f A |
78 | uint32_t s1L = ((p1->source & 0xFF000000) >> 24); |
79 | uint32_t s2L = ((p2->source & 0xFF000000) >> 24); | |
80 | const UChar *end = s1+s1L-1; | |
81 | ||
82 | if (p1 == p2) { | |
83 | return TRUE; | |
84 | } | |
85 | if (p1->source == 0 || p2->source == 0) { | |
86 | return FALSE; | |
87 | } | |
88 | if(s1L != s2L) { | |
46f4442e | 89 | return FALSE; |
b75a7d8f A |
90 | } |
91 | if(p1->source == p2->source) { | |
46f4442e | 92 | return TRUE; |
b75a7d8f A |
93 | } |
94 | while((s1 < end) && *s1 == *s2) { | |
46f4442e A |
95 | ++s1; |
96 | ++s2; | |
b75a7d8f A |
97 | } |
98 | if(*s1 == *s2) { | |
46f4442e | 99 | return TRUE; |
b75a7d8f | 100 | } else { |
46f4442e | 101 | return FALSE; |
b75a7d8f A |
102 | } |
103 | } | |
104 | U_CDECL_END | |
105 | ||
729e4ab9 A |
106 | /* |
107 | * Debug messages used to pinpoint where a format error occurred. | |
108 | * A better way is to include context-sensitive information in syntaxError() function. | |
109 | * | |
110 | * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_FORMAT_ERROR | |
111 | * in the compile line. | |
112 | */ | |
113 | /* #define DEBUG_FOR_FORMAT_ERROR 1 */ | |
114 | ||
115 | #ifdef DEBUG_FOR_FORMAT_ERROR | |
116 | #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);} | |
117 | #else | |
118 | #define DBG_FORMAT_ERROR | |
119 | #endif | |
120 | ||
121 | ||
122 | /* | |
123 | * Controls debug messages so that the output can be compared before and after a | |
124 | * big change. Prints the information of every code point that comes out of the | |
125 | * collation parser and its strength into a file. When a big change in format | |
126 | * happens, the files before and after the change should be identical. | |
127 | * | |
128 | * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_CODE_POINTS | |
129 | * in the compile line. | |
130 | */ | |
131 | // #define DEBUG_FOR_CODE_POINTS 1 | |
132 | ||
133 | #ifdef DEBUG_FOR_CODE_POINTS | |
134 | FILE* dfcp_fp = NULL; | |
135 | #endif | |
136 | ||
137 | ||
46f4442e | 138 | /*static inline void U_CALLCONV |
b75a7d8f | 139 | uhash_freeBlockWrapper(void *obj) { |
46f4442e A |
140 | uhash_freeBlock(obj); |
141 | }*/ | |
b75a7d8f A |
142 | |
143 | ||
/*
 * One entry of the indirect-positioning table. setIndirectBoundaries()
 * fills the two start fields from its `start` pair and the two limit
 * fields from its `end` pair (or with 0 when no `end` pair is given).
 */
typedef struct {
    uint32_t startCE;       /* start[0] */
    uint32_t startContCE;   /* start[1] */
    uint32_t limitCE;       /* end[0], or 0 */
    uint32_t limitContCE;   /* end[1], or 0 */
} indirectBoundaries;

/*
 * These values are used for finding CE values for indirect positioning.
 * Indirect positioning is a mechanism for allowing resets on symbolic
 * values. It only works for resets, and indirect names cannot be tailored.
 *
 * An indirect name can define either an anchor point or a range. An anchor
 * point behaves in exactly the same way as a code point in a reset would,
 * except that it cannot be tailored. A range (currently the only known one
 * is the [top] range) explicitly sets the upper bound for generated CEs,
 * thus allowing better control over how many CEs can be squeezed into the
 * range without a performance penalty. In that respect, [top] is used for
 * tailoring locales that use CJK characters.
 *
 * Other indirect values are currently a pure convenience: they can be used
 * to ensure that the CEs are always positioned in the same place relative
 * to a point with known properties (e.g. first primary ignorable).
 */
static indirectBoundaries ucolIndirectBoundaries[15];
166 | /* | |
167 | static indirectBoundaries ucolIndirectBoundaries[11] = { | |
46f4442e A |
168 | { UCOL_RESET_TOP_VALUE, 0, |
169 | UCOL_NEXT_TOP_VALUE, 0 }, | |
170 | { UCOL_FIRST_PRIMARY_IGNORABLE, 0, | |
171 | 0, 0 }, | |
172 | { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, | |
173 | 0, 0 }, | |
174 | { UCOL_FIRST_SECONDARY_IGNORABLE, 0, | |
175 | 0, 0 }, | |
176 | { UCOL_LAST_SECONDARY_IGNORABLE, 0, | |
177 | 0, 0 }, | |
178 | { UCOL_FIRST_TERTIARY_IGNORABLE, 0, | |
179 | 0, 0 }, | |
180 | { UCOL_LAST_TERTIARY_IGNORABLE, 0, | |
181 | 0, 0 }, | |
182 | { UCOL_FIRST_VARIABLE, 0, | |
183 | 0, 0 }, | |
184 | { UCOL_LAST_VARIABLE, 0, | |
185 | 0, 0 }, | |
186 | { UCOL_FIRST_NON_VARIABLE, 0, | |
187 | 0, 0 }, | |
188 | { UCOL_LAST_NON_VARIABLE, 0, | |
189 | 0, 0 }, | |
b75a7d8f A |
190 | }; |
191 | */ | |
192 | ||
374ca955 A |
193 | static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) { |
194 | ||
46f4442e A |
195 | // Set values for the top - TODO: once we have values for all the indirects, we are going |
196 | // to initalize here. | |
197 | ucolIndirectBoundaries[indexR].startCE = start[0]; | |
198 | ucolIndirectBoundaries[indexR].startContCE = start[1]; | |
199 | if(end) { | |
200 | ucolIndirectBoundaries[indexR].limitCE = end[0]; | |
201 | ucolIndirectBoundaries[indexR].limitContCE = end[1]; | |
202 | } else { | |
203 | ucolIndirectBoundaries[indexR].limitCE = 0; | |
204 | ucolIndirectBoundaries[indexR].limitContCE = 0; | |
205 | } | |
b75a7d8f A |
206 | } |
207 | ||
208 | ||
374ca955 A |
209 | static inline |
210 | void syntaxError(const UChar* rules, | |
b75a7d8f A |
211 | int32_t pos, |
212 | int32_t rulesLen, | |
46f4442e A |
213 | UParseError* parseError) |
214 | { | |
b75a7d8f A |
215 | parseError->offset = pos; |
216 | parseError->line = 0 ; /* we are not using line numbers */ | |
374ca955 | 217 | |
b75a7d8f | 218 | // for pre-context |
46f4442e | 219 | int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); |
b75a7d8f | 220 | int32_t stop = pos; |
374ca955 | 221 | |
b75a7d8f A |
222 | u_memcpy(parseError->preContext,rules+start,stop-start); |
223 | //null terminate the buffer | |
224 | parseError->preContext[stop-start] = 0; | |
374ca955 | 225 | |
b75a7d8f A |
226 | //for post-context |
227 | start = pos+1; | |
374ca955 | 228 | stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : |
46f4442e | 229 | rulesLen; |
b75a7d8f | 230 | |
374ca955 | 231 | if(start < stop) { |
46f4442e A |
232 | u_memcpy(parseError->postContext,rules+start,stop-start); |
233 | //null terminate the buffer | |
234 | parseError->postContext[stop-start]= 0; | |
374ca955 | 235 | } else { |
46f4442e | 236 | parseError->postContext[0] = 0; |
374ca955 | 237 | } |
b75a7d8f A |
238 | } |
239 | ||
240 | static | |
241 | void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { | |
46f4442e A |
242 | switch(attrib) { |
243 | case UCOL_HIRAGANA_QUATERNARY_MODE: | |
244 | opts->hiraganaQ = value; | |
245 | break; | |
246 | case UCOL_FRENCH_COLLATION: | |
247 | opts->frenchCollation = value; | |
248 | break; | |
249 | case UCOL_ALTERNATE_HANDLING: | |
250 | opts->alternateHandling = value; | |
251 | break; | |
252 | case UCOL_CASE_FIRST: | |
253 | opts->caseFirst = value; | |
254 | break; | |
255 | case UCOL_CASE_LEVEL: | |
256 | opts->caseLevel = value; | |
257 | break; | |
258 | case UCOL_NORMALIZATION_MODE: | |
259 | opts->normalizationMode = value; | |
260 | break; | |
261 | case UCOL_STRENGTH: | |
262 | opts->strength = value; | |
263 | break; | |
264 | case UCOL_NUMERIC_COLLATION: | |
265 | opts->numericCollation = value; | |
266 | break; | |
267 | case UCOL_ATTRIBUTE_COUNT: | |
268 | default: | |
269 | break; | |
270 | } | |
b75a7d8f A |
271 | } |
272 | ||
729e4ab9 | 273 | #define UTOK_OPTION_COUNT 22 |
b75a7d8f A |
274 | |
275 | static UBool didInit = FALSE; | |
276 | /* we can be strict, or we can be lenient */ | |
277 | /* I'd surely be lenient with the option arguments */ | |
278 | /* maybe even with options */ | |
279 | U_STRING_DECL(suboption_00, "non-ignorable", 13); | |
280 | U_STRING_DECL(suboption_01, "shifted", 7); | |
281 | ||
282 | U_STRING_DECL(suboption_02, "lower", 5); | |
283 | U_STRING_DECL(suboption_03, "upper", 5); | |
284 | U_STRING_DECL(suboption_04, "off", 3); | |
285 | U_STRING_DECL(suboption_05, "on", 2); | |
286 | U_STRING_DECL(suboption_06, "1", 1); | |
287 | U_STRING_DECL(suboption_07, "2", 1); | |
288 | U_STRING_DECL(suboption_08, "3", 1); | |
289 | U_STRING_DECL(suboption_09, "4", 1); | |
290 | U_STRING_DECL(suboption_10, "I", 1); | |
291 | ||
292 | U_STRING_DECL(suboption_11, "primary", 7); | |
293 | U_STRING_DECL(suboption_12, "secondary", 9); | |
294 | U_STRING_DECL(suboption_13, "tertiary", 8); | |
295 | U_STRING_DECL(suboption_14, "variable", 8); | |
296 | U_STRING_DECL(suboption_15, "regular", 7); | |
297 | U_STRING_DECL(suboption_16, "implicit", 8); | |
298 | U_STRING_DECL(suboption_17, "trailing", 8); | |
299 | ||
300 | ||
301 | U_STRING_DECL(option_00, "undefined", 9); | |
374ca955 | 302 | U_STRING_DECL(option_01, "rearrange", 9); |
b75a7d8f | 303 | U_STRING_DECL(option_02, "alternate", 9); |
374ca955 A |
304 | U_STRING_DECL(option_03, "backwards", 9); |
305 | U_STRING_DECL(option_04, "variable top", 12); | |
306 | U_STRING_DECL(option_05, "top", 3); | |
307 | U_STRING_DECL(option_06, "normalization", 13); | |
308 | U_STRING_DECL(option_07, "caseLevel", 9); | |
309 | U_STRING_DECL(option_08, "caseFirst", 9); | |
310 | U_STRING_DECL(option_09, "scriptOrder", 11); | |
311 | U_STRING_DECL(option_10, "charsetname", 11); | |
312 | U_STRING_DECL(option_11, "charset", 7); | |
313 | U_STRING_DECL(option_12, "before", 6); | |
b75a7d8f A |
314 | U_STRING_DECL(option_13, "hiraganaQ", 9); |
315 | U_STRING_DECL(option_14, "strength", 8); | |
316 | U_STRING_DECL(option_15, "first", 5); | |
317 | U_STRING_DECL(option_16, "last", 4); | |
318 | U_STRING_DECL(option_17, "optimize", 8); | |
319 | U_STRING_DECL(option_18, "suppressContractions", 20); | |
374ca955 | 320 | U_STRING_DECL(option_19, "numericOrdering", 15); |
729e4ab9 A |
321 | U_STRING_DECL(option_20, "import", 6); |
322 | U_STRING_DECL(option_21, "reorder", 7); | |
b75a7d8f A |
323 | |
324 | /* | |
374ca955 A |
325 | [last variable] last variable value |
326 | [last primary ignorable] largest CE for primary ignorable | |
327 | [last secondary ignorable] largest CE for secondary ignorable | |
328 | [last tertiary ignorable] largest CE for tertiary ignorable | |
329 | [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) | |
b75a7d8f A |
330 | */ |
331 | ||
332 | ||
333 | static const ucolTokSuboption alternateSub[2] = { | |
46f4442e A |
334 | {suboption_00, 13, UCOL_NON_IGNORABLE}, |
335 | {suboption_01, 7, UCOL_SHIFTED} | |
b75a7d8f A |
336 | }; |
337 | ||
338 | static const ucolTokSuboption caseFirstSub[3] = { | |
46f4442e A |
339 | {suboption_02, 5, UCOL_LOWER_FIRST}, |
340 | {suboption_03, 5, UCOL_UPPER_FIRST}, | |
341 | {suboption_04, 3, UCOL_OFF}, | |
b75a7d8f A |
342 | }; |
343 | ||
344 | static const ucolTokSuboption onOffSub[2] = { | |
46f4442e A |
345 | {suboption_04, 3, UCOL_OFF}, |
346 | {suboption_05, 2, UCOL_ON} | |
b75a7d8f A |
347 | }; |
348 | ||
349 | static const ucolTokSuboption frenchSub[1] = { | |
46f4442e | 350 | {suboption_07, 1, UCOL_ON} |
b75a7d8f A |
351 | }; |
352 | ||
353 | static const ucolTokSuboption beforeSub[3] = { | |
46f4442e A |
354 | {suboption_06, 1, UCOL_PRIMARY}, |
355 | {suboption_07, 1, UCOL_SECONDARY}, | |
356 | {suboption_08, 1, UCOL_TERTIARY} | |
b75a7d8f A |
357 | }; |
358 | ||
359 | static const ucolTokSuboption strengthSub[5] = { | |
46f4442e A |
360 | {suboption_06, 1, UCOL_PRIMARY}, |
361 | {suboption_07, 1, UCOL_SECONDARY}, | |
362 | {suboption_08, 1, UCOL_TERTIARY}, | |
363 | {suboption_09, 1, UCOL_QUATERNARY}, | |
364 | {suboption_10, 1, UCOL_IDENTICAL}, | |
b75a7d8f A |
365 | }; |
366 | ||
367 | static const ucolTokSuboption firstLastSub[7] = { | |
46f4442e A |
368 | {suboption_11, 7, UCOL_PRIMARY}, |
369 | {suboption_12, 9, UCOL_PRIMARY}, | |
370 | {suboption_13, 8, UCOL_PRIMARY}, | |
371 | {suboption_14, 8, UCOL_PRIMARY}, | |
372 | {suboption_15, 7, UCOL_PRIMARY}, | |
373 | {suboption_16, 8, UCOL_PRIMARY}, | |
374 | {suboption_17, 8, UCOL_PRIMARY}, | |
b75a7d8f A |
375 | }; |
376 | ||
/*
 * Indexes into rulesOptions[]. The numeric values must stay in step with
 * the order of the entries in that table.
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING    = 0,
    OPTION_FRENCH_COLLATION      = 1,
    OPTION_CASE_LEVEL            = 2,
    OPTION_CASE_FIRST            = 3,
    OPTION_NORMALIZATION_MODE    = 4,
    OPTION_HIRAGANA_QUATERNARY   = 5,
    OPTION_STRENGTH              = 6,
    OPTION_NUMERIC_COLLATION     = 7,
    /* alias of the last option above, not a separate index */
    OPTION_NORMAL_OPTIONS_LIMIT  = OPTION_NUMERIC_COLLATION,
    OPTION_VARIABLE_TOP          = 8,
    OPTION_REARRANGE             = 9,
    OPTION_BEFORE                = 10,
    OPTION_TOP                   = 11,
    OPTION_FIRST                 = 12,
    OPTION_LAST                  = 13,
    OPTION_OPTIMIZE              = 14,
    OPTION_SUPPRESS_CONTRACTIONS = 15,
    OPTION_UNDEFINED             = 16,
    OPTION_SCRIPT_ORDER          = 17,
    OPTION_CHARSET_NAME          = 18,
    OPTION_CHARSET               = 19,
    OPTION_IMPORT                = 20,
    OPTION_SCRIPTREORDER         = 21
};
402 | ||
403 | static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { | |
46f4442e A |
404 | /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ |
405 | /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ | |
406 | /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ | |
407 | /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ | |
408 | /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ | |
409 | /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */ | |
410 | /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ | |
411 | /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/ | |
412 | /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ | |
413 | /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ | |
414 | /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ | |
415 | /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ | |
416 | /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ | |
417 | /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ | |
418 | /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ | |
419 | /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */ | |
420 | /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ | |
421 | /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ | |
422 | /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ | |
729e4ab9 A |
423 | /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */ |
424 | /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */ | |
425 | /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */ | |
b75a7d8f A |
426 | }; |
427 | ||
428 | static | |
374ca955 | 429 | int32_t u_strncmpNoCase(const UChar *s1, |
46f4442e A |
430 | const UChar *s2, |
431 | int32_t n) | |
b75a7d8f A |
432 | { |
433 | if(n > 0) { | |
434 | int32_t rc; | |
435 | for(;;) { | |
436 | rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); | |
437 | if(rc != 0 || *s1 == 0 || --n == 0) { | |
438 | return rc; | |
439 | } | |
440 | ++s1; | |
441 | ++s2; | |
442 | } | |
443 | } | |
444 | return 0; | |
445 | } | |
446 | ||
447 | static | |
448 | void ucol_uprv_tok_initData() { | |
46f4442e A |
449 | if(!didInit) { |
450 | U_STRING_INIT(suboption_00, "non-ignorable", 13); | |
451 | U_STRING_INIT(suboption_01, "shifted", 7); | |
452 | ||
453 | U_STRING_INIT(suboption_02, "lower", 5); | |
454 | U_STRING_INIT(suboption_03, "upper", 5); | |
455 | U_STRING_INIT(suboption_04, "off", 3); | |
456 | U_STRING_INIT(suboption_05, "on", 2); | |
457 | ||
458 | U_STRING_INIT(suboption_06, "1", 1); | |
459 | U_STRING_INIT(suboption_07, "2", 1); | |
460 | U_STRING_INIT(suboption_08, "3", 1); | |
461 | U_STRING_INIT(suboption_09, "4", 1); | |
462 | U_STRING_INIT(suboption_10, "I", 1); | |
463 | ||
464 | U_STRING_INIT(suboption_11, "primary", 7); | |
465 | U_STRING_INIT(suboption_12, "secondary", 9); | |
466 | U_STRING_INIT(suboption_13, "tertiary", 8); | |
467 | U_STRING_INIT(suboption_14, "variable", 8); | |
468 | U_STRING_INIT(suboption_15, "regular", 7); | |
469 | U_STRING_INIT(suboption_16, "implicit", 8); | |
470 | U_STRING_INIT(suboption_17, "trailing", 8); | |
471 | ||
472 | ||
473 | U_STRING_INIT(option_00, "undefined", 9); | |
474 | U_STRING_INIT(option_01, "rearrange", 9); | |
475 | U_STRING_INIT(option_02, "alternate", 9); | |
476 | U_STRING_INIT(option_03, "backwards", 9); | |
477 | U_STRING_INIT(option_04, "variable top", 12); | |
478 | U_STRING_INIT(option_05, "top", 3); | |
479 | U_STRING_INIT(option_06, "normalization", 13); | |
480 | U_STRING_INIT(option_07, "caseLevel", 9); | |
481 | U_STRING_INIT(option_08, "caseFirst", 9); | |
482 | U_STRING_INIT(option_09, "scriptOrder", 11); | |
483 | U_STRING_INIT(option_10, "charsetname", 11); | |
484 | U_STRING_INIT(option_11, "charset", 7); | |
485 | U_STRING_INIT(option_12, "before", 6); | |
486 | U_STRING_INIT(option_13, "hiraganaQ", 9); | |
487 | U_STRING_INIT(option_14, "strength", 8); | |
488 | U_STRING_INIT(option_15, "first", 5); | |
489 | U_STRING_INIT(option_16, "last", 4); | |
490 | U_STRING_INIT(option_17, "optimize", 8); | |
491 | U_STRING_INIT(option_18, "suppressContractions", 20); | |
492 | U_STRING_INIT(option_19, "numericOrdering", 15); | |
729e4ab9 A |
493 | U_STRING_INIT(option_20, "import ", 6); |
494 | U_STRING_INIT(option_21, "reorder", 7); | |
46f4442e A |
495 | didInit = TRUE; |
496 | } | |
b75a7d8f A |
497 | } |
498 | ||
499 | ||
500 | // This function reads basic options to set in the runtime collator | |
501 | // used by data driven tests. Should not support build time options | |
502 | U_CAPI const UChar * U_EXPORT2 | |
374ca955 | 503 | ucol_tok_getNextArgument(const UChar *start, const UChar *end, |
46f4442e A |
504 | UColAttribute *attrib, UColAttributeValue *value, |
505 | UErrorCode *status) | |
506 | { | |
507 | uint32_t i = 0; | |
508 | int32_t j=0; | |
509 | UBool foundOption = FALSE; | |
510 | const UChar *optionArg = NULL; | |
511 | ||
512 | ucol_uprv_tok_initData(); | |
513 | ||
729e4ab9 | 514 | while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */ |
46f4442e A |
515 | start++; |
516 | } | |
517 | if(start >= end) { | |
518 | return NULL; | |
519 | } | |
520 | /* skip opening '[' */ | |
521 | if(*start == 0x005b) { | |
522 | start++; | |
523 | } else { | |
524 | *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' | |
525 | return NULL; | |
b75a7d8f | 526 | } |
b75a7d8f | 527 | |
46f4442e A |
528 | while(i < UTOK_OPTION_COUNT) { |
529 | if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { | |
530 | foundOption = TRUE; | |
531 | if(end - start > rulesOptions[i].optionLen) { | |
532 | optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ | |
729e4ab9 | 533 | while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */ |
46f4442e A |
534 | optionArg++; |
535 | } | |
536 | } | |
537 | break; | |
b75a7d8f | 538 | } |
46f4442e A |
539 | i++; |
540 | } | |
541 | ||
542 | if(!foundOption) { | |
543 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
544 | return NULL; | |
545 | } | |
546 | ||
547 | if(optionArg) { | |
548 | for(j = 0; j<rulesOptions[i].subSize; j++) { | |
549 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { | |
550 | //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); | |
551 | *attrib = rulesOptions[i].attr; | |
552 | *value = rulesOptions[i].subopts[j].attrVal; | |
553 | optionArg += rulesOptions[i].subopts[j].subLen; | |
729e4ab9 | 554 | while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */ |
46f4442e A |
555 | optionArg++; |
556 | } | |
557 | if(*optionArg == 0x005d) { | |
558 | optionArg++; | |
559 | return optionArg; | |
560 | } else { | |
561 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
562 | return NULL; | |
563 | } | |
564 | } | |
b75a7d8f | 565 | } |
b75a7d8f | 566 | } |
46f4442e A |
567 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
568 | return NULL; | |
b75a7d8f A |
569 | } |
570 | ||
374ca955 | 571 | static |
b75a7d8f | 572 | USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { |
46f4442e A |
573 | while(*start != 0x005b) { /* advance while we find the first '[' */ |
574 | start++; | |
575 | } | |
576 | // now we need to get a balanced set of '[]'. The problem is that a set can have | |
577 | // many, and *end point to the first closing '[' | |
578 | int32_t noOpenBraces = 1; | |
579 | int32_t current = 1; // skip the opening brace | |
580 | while(start+current < end && noOpenBraces != 0) { | |
581 | if(start[current] == 0x005b) { | |
582 | noOpenBraces++; | |
583 | } else if(start[current] == 0x005D) { // closing brace | |
584 | noOpenBraces--; | |
585 | } | |
586 | current++; | |
b75a7d8f | 587 | } |
b75a7d8f | 588 | |
46f4442e A |
589 | if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { |
590 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
591 | return NULL; | |
592 | } | |
593 | return uset_openPattern(start, current, status); | |
b75a7d8f A |
594 | } |
595 | ||
729e4ab9 A |
596 | /** |
597 | * Reads an option and matches the option name with the predefined options. (Case-insensitive.) | |
598 | * @param start Pointer to the start UChar. | |
599 | * @param end Pointer to the last valid pointer beyond which the option will not extend. | |
600 | * @param optionArg Address of the pointer at which the options start (after the option name) | |
601 | * @return The index of the option, or -1 if the option is not valid. | |
602 | */ | |
b75a7d8f A |
603 | static |
604 | int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { | |
46f4442e A |
605 | int32_t i = 0; |
606 | ucol_uprv_tok_initData(); | |
607 | ||
729e4ab9 | 608 | while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */ |
46f4442e A |
609 | start++; |
610 | } | |
611 | while(i < UTOK_OPTION_COUNT) { | |
612 | if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { | |
613 | if(end - start > rulesOptions[i].optionLen) { | |
729e4ab9 A |
614 | *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */ |
615 | while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */ | |
46f4442e A |
616 | (*optionArg)++; |
617 | } | |
618 | } | |
619 | break; | |
b75a7d8f | 620 | } |
46f4442e A |
621 | i++; |
622 | } | |
623 | if(i == UTOK_OPTION_COUNT) { | |
624 | i = -1; // didn't find an option | |
b75a7d8f | 625 | } |
46f4442e | 626 | return i; |
b75a7d8f A |
627 | } |
628 | ||
629 | ||
729e4ab9 A |
630 | static |
631 | void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { | |
632 | int32_t codeCount = 0; | |
633 | int32_t codeIndex = 0; | |
634 | char conversion[64]; | |
635 | int32_t tokenLength = 0; | |
636 | const UChar* space; | |
637 | ||
638 | const UChar* current = src->current; | |
639 | const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); | |
640 | ||
641 | // eat leading whitespace | |
642 | while(current < end && u_isWhitespace(*current)) { | |
643 | current++; | |
644 | } | |
645 | ||
646 | while(current < end) { | |
647 | space = u_memchr(current, 0x0020, end - current); | |
648 | space = space == 0 ? end : space; | |
649 | tokenLength = space - current; | |
650 | if (tokenLength < 4) { | |
651 | *status = U_INVALID_FORMAT_ERROR; | |
652 | return; | |
653 | } | |
654 | codeCount++; | |
655 | current += tokenLength; | |
656 | while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ | |
657 | ++current; | |
658 | } | |
659 | } | |
660 | ||
661 | if (codeCount == 0) { | |
662 | *status = U_INVALID_FORMAT_ERROR; | |
663 | } | |
664 | ||
665 | src->reorderCodesLength = codeCount; | |
666 | src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); | |
667 | current = src->current; | |
668 | ||
669 | // eat leading whitespace | |
670 | while(current < end && u_isWhitespace(*current)) { | |
671 | current++; | |
672 | } | |
673 | ||
674 | while(current < end) { | |
675 | space = u_memchr(current, 0x0020, end - current); | |
676 | space = space == 0 ? end : space; | |
677 | tokenLength = space - current; | |
678 | if (tokenLength < 4) { | |
679 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
680 | return; | |
681 | } else { | |
682 | u_UCharsToChars(current, conversion, tokenLength); | |
683 | conversion[tokenLength] = '\0'; | |
684 | src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); | |
685 | if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { | |
686 | src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); | |
687 | } | |
688 | if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { | |
689 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
690 | } | |
691 | } | |
692 | codeIndex++; | |
693 | current += tokenLength; | |
694 | while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ | |
695 | ++current; | |
696 | } | |
697 | } | |
698 | } | |
699 | ||
b75a7d8f A |
700 | // reads and conforms to various options in rules |
701 | // end is the position of the first closing ']' | |
702 | // However, some of the options take an UnicodeSet definition | |
703 | // which needs to duplicate the closing ']' | |
704 | // for example: '[copy [\uAC00-\uD7FF]]' | |
374ca955 | 705 | // These options will move end to the second ']' and the |
b75a7d8f A |
706 | // caller will set the current to it. |
707 | static | |
708 | uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { | |
46f4442e A |
709 | const UChar* start = src->current; |
710 | int32_t i = 0; | |
711 | int32_t j=0; | |
712 | const UChar *optionArg = NULL; | |
b75a7d8f | 713 | |
46f4442e | 714 | uint8_t result = 0; |
b75a7d8f | 715 | |
46f4442e A |
716 | start++; /*skip opening '['*/ |
717 | i = ucol_uprv_tok_readOption(start, src->end, &optionArg); | |
718 | if(optionArg) { | |
719 | src->current = optionArg; | |
720 | } | |
b75a7d8f | 721 | |
46f4442e A |
722 | if(i < 0) { |
723 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
724 | } else { | |
725 | int32_t noOpenBraces = 1; | |
726 | switch(i) { | |
b75a7d8f A |
727 | case OPTION_ALTERNATE_HANDLING: |
728 | case OPTION_FRENCH_COLLATION: | |
729 | case OPTION_CASE_LEVEL: | |
730 | case OPTION_CASE_FIRST: | |
731 | case OPTION_NORMALIZATION_MODE: | |
732 | case OPTION_HIRAGANA_QUATERNARY: | |
733 | case OPTION_STRENGTH: | |
734 | case OPTION_NUMERIC_COLLATION: | |
46f4442e A |
735 | if(optionArg) { |
736 | for(j = 0; j<rulesOptions[i].subSize; j++) { | |
737 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { | |
738 | ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); | |
739 | result = UCOL_TOK_SUCCESS; | |
740 | } | |
741 | } | |
b75a7d8f | 742 | } |
46f4442e A |
743 | if(result == 0) { |
744 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
745 | } | |
746 | break; | |
b75a7d8f | 747 | case OPTION_VARIABLE_TOP: |
46f4442e A |
748 | result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; |
749 | break; | |
b75a7d8f | 750 | case OPTION_REARRANGE: |
46f4442e A |
751 | result = UCOL_TOK_SUCCESS; |
752 | break; | |
b75a7d8f | 753 | case OPTION_BEFORE: |
46f4442e A |
754 | if(optionArg) { |
755 | for(j = 0; j<rulesOptions[i].subSize; j++) { | |
756 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { | |
729e4ab9 | 757 | result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1); |
46f4442e A |
758 | } |
759 | } | |
b75a7d8f | 760 | } |
46f4442e A |
761 | if(result == 0) { |
762 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
763 | } | |
764 | break; | |
b75a7d8f | 765 | case OPTION_TOP: /* we are going to have an array with structures of limit CEs */ |
46f4442e A |
766 | /* index to this array will be src->parsedToken.indirectIndex*/ |
767 | src->parsedToken.indirectIndex = 0; | |
768 | result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; | |
769 | break; | |
b75a7d8f A |
770 | case OPTION_FIRST: |
771 | case OPTION_LAST: /* first, last */ | |
46f4442e A |
772 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
773 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { | |
774 | // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first | |
775 | // element of indirect boundaries is reserved for top. | |
776 | src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); | |
777 | result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; | |
778 | } | |
b75a7d8f | 779 | } |
46f4442e A |
780 | if(result == 0) { |
781 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
782 | } | |
783 | break; | |
b75a7d8f A |
784 | case OPTION_OPTIMIZE: |
785 | case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization | |
46f4442e A |
786 | // we need to move end here |
787 | src->current++; // skip opening brace | |
788 | while(src->current < src->end && noOpenBraces != 0) { | |
789 | if(*src->current == 0x005b) { | |
790 | noOpenBraces++; | |
791 | } else if(*src->current == 0x005D) { // closing brace | |
792 | noOpenBraces--; | |
793 | } | |
794 | src->current++; | |
b75a7d8f | 795 | } |
46f4442e A |
796 | result = UCOL_TOK_SUCCESS; |
797 | break; | |
729e4ab9 A |
798 | case OPTION_SCRIPTREORDER: |
799 | ucol_tok_parseScriptReorder(src, status); | |
800 | break; | |
b75a7d8f | 801 | default: |
46f4442e A |
802 | *status = U_UNSUPPORTED_ERROR; |
803 | break; | |
804 | } | |
b75a7d8f | 805 | } |
729e4ab9 | 806 | src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current)); |
46f4442e | 807 | return result; |
b75a7d8f A |
808 | } |
809 | ||
374ca955 A |
810 | |
811 | inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) { | |
729e4ab9 A |
812 | if (stuff == NULL || len <= 0) { |
813 | return; | |
814 | } | |
815 | UnicodeString tempStuff(FALSE, stuff, len); | |
46f4442e | 816 | if(src->extraCurrent+len >= src->extraEnd) { |
374ca955 | 817 | /* reallocate */ |
729e4ab9 A |
818 | if (stuff >= src->source && stuff <= src->end) { |
819 | // Copy the "stuff" contents into tempStuff's own buffer. | |
820 | // UnicodeString is copy-on-write. | |
821 | if (len > 0) { | |
822 | tempStuff.setCharAt(0, tempStuff[0]); | |
823 | } else { | |
824 | tempStuff.remove(); | |
825 | } | |
826 | } | |
374ca955 A |
827 | UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar)); |
828 | if(newSrc != NULL) { | |
46f4442e A |
829 | src->current = newSrc + (src->current - src->source); |
830 | src->extraCurrent = newSrc + (src->extraCurrent - src->source); | |
831 | src->end = newSrc + (src->end - src->source); | |
832 | src->extraEnd = newSrc + (src->extraEnd-src->source)*2; | |
833 | src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); | |
834 | src->source = newSrc; | |
374ca955 | 835 | } else { |
46f4442e | 836 | *status = U_MEMORY_ALLOCATION_ERROR; |
729e4ab9 | 837 | return; |
374ca955 | 838 | } |
46f4442e A |
839 | } |
840 | if(len == 1) { | |
729e4ab9 | 841 | *src->extraCurrent++ = tempStuff[0]; |
46f4442e | 842 | } else { |
729e4ab9 | 843 | u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len); |
374ca955 | 844 | src->extraCurrent += len; |
46f4442e | 845 | } |
374ca955 A |
846 | } |
847 | ||
848 | inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { | |
46f4442e A |
849 | /* |
850 | top = TRUE; | |
851 | */ | |
852 | UChar buff[5]; | |
853 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
854 | buff[0] = 0xFFFE; | |
855 | buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); | |
856 | buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); | |
857 | if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) { | |
858 | src->parsedToken.charsLen = 3; | |
859 | ucol_tok_addToExtraCurrent(src, buff, 3, status); | |
860 | } else { | |
861 | buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); | |
862 | buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); | |
863 | src->parsedToken.charsLen = 5; | |
864 | ucol_tok_addToExtraCurrent(src, buff, 5, status); | |
865 | } | |
866 | return TRUE; | |
b75a7d8f A |
867 | } |
868 | ||
374ca955 A |
869 | static UBool isCharNewLine(UChar c){ |
870 | switch(c){ | |
871 | case 0x000A: /* LF */ | |
872 | case 0x000D: /* CR */ | |
873 | case 0x000C: /* FF */ | |
874 | case 0x0085: /* NEL */ | |
875 | case 0x2028: /* LS */ | |
876 | case 0x2029: /* PS */ | |
877 | return TRUE; | |
878 | default: | |
879 | return FALSE; | |
880 | } | |
881 | } | |
882 | ||
729e4ab9 A |
883 | /* |
884 | * This function is called several times when a range is processed. Each time, the next code point | |
885 | * is processed. | |
886 | * The following variables must be set before calling this function: | |
887 | * src->currentRangeCp: The current code point to process. | |
888 | * src->lastRangeCp: The last code point in the range. | |
889 | * Pre-requisite: src->currentRangeCp <= src->lastRangeCp. | |
890 | */ | |
891 | static const UChar* | |
892 | ucol_tok_processNextCodePointInRange(UColTokenParser *src, | |
893 | UErrorCode *status) | |
894 | { | |
895 | // Append current code point to source | |
896 | UChar buff[U16_MAX_LENGTH]; | |
897 | uint32_t i = 0; | |
898 | ||
899 | uint32_t nChars = U16_LENGTH(src->currentRangeCp); | |
900 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
901 | src->parsedToken.charsLen = nChars; | |
902 | ||
903 | U16_APPEND_UNSAFE(buff, i, src->currentRangeCp); | |
904 | ucol_tok_addToExtraCurrent(src, buff, nChars, status); | |
905 | ||
906 | ++src->currentRangeCp; | |
907 | if (src->currentRangeCp > src->lastRangeCp) { | |
908 | src->inRange = FALSE; | |
909 | ||
910 | if (src->currentStarredCharIndex > src->lastStarredCharIndex) { | |
911 | src->isStarred = FALSE; | |
912 | } | |
913 | } else { | |
914 | src->previousCp = src->currentRangeCp; | |
915 | } | |
916 | return src->current; | |
917 | } | |
918 | ||
919 | /* | |
920 | * This function is called several times when a starred list is processed. Each time, the next code point | |
921 | * in the list is processed. | |
922 | * The following variables must be set before calling this function: | |
923 | * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point. | |
924 | * src->lastStarredCharIndex: Index to the last character in the list. | |
925 | * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex. | |
926 | */ | |
927 | static const UChar* | |
928 | ucol_tok_processNextTokenInStarredList(UColTokenParser *src) | |
929 | { | |
930 | // Extract the characters corresponding to the next code point. | |
931 | UChar32 cp; | |
932 | src->parsedToken.charsOffset = src->currentStarredCharIndex; | |
933 | int32_t prev = src->currentStarredCharIndex; | |
934 | U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp); | |
935 | src->parsedToken.charsLen = src->currentStarredCharIndex - prev; | |
936 | ||
937 | // When we are done parsing the starred string, turn the flag off so that | |
938 | // the normal processing is restored. | |
939 | if (src->currentStarredCharIndex > src->lastStarredCharIndex) { | |
940 | src->isStarred = FALSE; | |
941 | } | |
942 | src->previousCp = cp; | |
943 | return src->current; | |
944 | } | |
945 | ||
946 | /* | |
947 | * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters. | |
948 | * | |
949 | * This routine parses and separates almost all tokens. The following are the syntax characters recognized. | |
950 | * # : Comment character | |
951 | * & : Reset operator | |
952 | * = : Equality | |
953 | * < : Primary collation | |
954 | * << : Secondary collation | |
955 | * <<< : Tertiary collation | |
956 | * ; : Secondary collation | |
957 | * , : Tertiary collation | |
958 | * / : Expansions | |
959 | * | : Prefix | |
960 | * - : Range | |
961 | ||
962 | * ! : Java Thai modifier, ignored | |
963 | * @ : French only | |
964 | ||
965 | * [] : Options | |
966 | * '' : Quotes | |
967 | * | |
968 | * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz | |
969 | * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above. | |
970 | * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a", | |
971 | * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous | |
972 | * character returned as cached so that the calling program can do further splitting. | |
973 | */ | |
974 | static const UChar* | |
975 | ucol_tok_parseNextTokenInternal(UColTokenParser *src, | |
976 | UBool startOfRules, | |
977 | UParseError *parseError, | |
978 | UErrorCode *status) | |
46f4442e | 979 | { |
46f4442e A |
980 | UBool variableTop = FALSE; |
981 | UBool top = FALSE; | |
982 | UBool inChars = TRUE; | |
983 | UBool inQuote = FALSE; | |
984 | UBool wasInQuote = FALSE; | |
985 | uint8_t before = 0; | |
986 | UBool isEscaped = FALSE; | |
729e4ab9 | 987 | |
46f4442e A |
988 | // TODO: replace these variables with src->parsedToken counterparts |
989 | // no need to use them anymore since we have src->parsedToken. | |
990 | // Ideally, token parser would be a nice class... Once, when I have | |
991 | // more time (around 2020 probably). | |
992 | uint32_t newExtensionLen = 0; | |
993 | uint32_t extensionOffset = 0; | |
994 | uint32_t newStrength = UCOL_TOK_UNSET; | |
995 | UChar buff[10]; | |
996 | ||
997 | src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; | |
998 | src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; | |
999 | src->parsedToken.indirectIndex = 0; | |
1000 | ||
1001 | while (src->current < src->end) { | |
1002 | UChar ch = *(src->current); | |
1003 | ||
1004 | if (inQuote) { | |
1005 | if (ch == 0x0027/*'\''*/) { | |
1006 | inQuote = FALSE; | |
1007 | } else { | |
1008 | if ((src->parsedToken.charsLen == 0) || inChars) { | |
1009 | if(src->parsedToken.charsLen == 0) { | |
1010 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
1011 | } | |
1012 | src->parsedToken.charsLen++; | |
1013 | } else { | |
1014 | if(newExtensionLen == 0) { | |
1015 | extensionOffset = (uint32_t)(src->extraCurrent - src->source); | |
1016 | } | |
1017 | newExtensionLen++; | |
1018 | } | |
b75a7d8f | 1019 | } |
46f4442e A |
1020 | }else if(isEscaped){ |
1021 | isEscaped =FALSE; | |
1022 | if (newStrength == UCOL_TOK_UNSET) { | |
1023 | *status = U_INVALID_FORMAT_ERROR; | |
1024 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
729e4ab9 | 1025 | DBG_FORMAT_ERROR |
46f4442e A |
1026 | return NULL; |
1027 | // enabling rules to start with non-tokens a < b | |
1028 | // newStrength = UCOL_TOK_RESET; | |
b75a7d8f | 1029 | } |
46f4442e A |
1030 | if(ch != 0x0000 && src->current != src->end) { |
1031 | if (inChars) { | |
1032 | if(src->parsedToken.charsLen == 0) { | |
1033 | src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); | |
1034 | } | |
1035 | src->parsedToken.charsLen++; | |
1036 | } else { | |
1037 | if(newExtensionLen == 0) { | |
1038 | extensionOffset = (uint32_t)(src->current - src->source); | |
1039 | } | |
1040 | newExtensionLen++; | |
1041 | } | |
b75a7d8f | 1042 | } |
46f4442e A |
1043 | }else { |
1044 | if(!uprv_isRuleWhiteSpace(ch)) { | |
1045 | /* Sets the strength for this entry */ | |
1046 | switch (ch) { | |
1047 | case 0x003D/*'='*/ : | |
1048 | if (newStrength != UCOL_TOK_UNSET) { | |
1049 | goto EndOfLoop; | |
1050 | } | |
b75a7d8f | 1051 | |
46f4442e A |
1052 | /* if we start with strength, we'll reset to top */ |
1053 | if(startOfRules == TRUE) { | |
1054 | src->parsedToken.indirectIndex = 5; | |
1055 | top = ucol_tok_doSetTop(src, status); | |
1056 | newStrength = UCOL_TOK_RESET; | |
1057 | goto EndOfLoop; | |
1058 | } | |
1059 | newStrength = UCOL_IDENTICAL; | |
729e4ab9 A |
1060 | if(*(src->current+1) == 0x002A) {/*'*'*/ |
1061 | src->current++; | |
1062 | src->isStarred = TRUE; | |
1063 | } | |
46f4442e | 1064 | break; |
b75a7d8f | 1065 | |
46f4442e A |
1066 | case 0x002C/*','*/: |
1067 | if (newStrength != UCOL_TOK_UNSET) { | |
1068 | goto EndOfLoop; | |
1069 | } | |
b75a7d8f | 1070 | |
46f4442e A |
1071 | /* if we start with strength, we'll reset to top */ |
1072 | if(startOfRules == TRUE) { | |
1073 | src->parsedToken.indirectIndex = 5; | |
1074 | top = ucol_tok_doSetTop(src, status); | |
1075 | newStrength = UCOL_TOK_RESET; | |
1076 | goto EndOfLoop; | |
1077 | } | |
1078 | newStrength = UCOL_TERTIARY; | |
1079 | break; | |
b75a7d8f | 1080 | |
46f4442e A |
1081 | case 0x003B/*';'*/: |
1082 | if (newStrength != UCOL_TOK_UNSET) { | |
1083 | goto EndOfLoop; | |
1084 | } | |
b75a7d8f | 1085 | |
46f4442e A |
1086 | /* if we start with strength, we'll reset to top */ |
1087 | if(startOfRules == TRUE) { | |
1088 | src->parsedToken.indirectIndex = 5; | |
1089 | top = ucol_tok_doSetTop(src, status); | |
1090 | newStrength = UCOL_TOK_RESET; | |
1091 | goto EndOfLoop; | |
b75a7d8f | 1092 | } |
46f4442e A |
1093 | newStrength = UCOL_SECONDARY; |
1094 | break; | |
b75a7d8f | 1095 | |
46f4442e A |
1096 | case 0x003C/*'<'*/: |
1097 | if (newStrength != UCOL_TOK_UNSET) { | |
1098 | goto EndOfLoop; | |
1099 | } | |
b75a7d8f | 1100 | |
46f4442e A |
1101 | /* if we start with strength, we'll reset to top */ |
1102 | if(startOfRules == TRUE) { | |
1103 | src->parsedToken.indirectIndex = 5; | |
1104 | top = ucol_tok_doSetTop(src, status); | |
1105 | newStrength = UCOL_TOK_RESET; | |
1106 | goto EndOfLoop; | |
1107 | } | |
1108 | /* before this, do a scan to verify whether this is */ | |
1109 | /* another strength */ | |
1110 | if(*(src->current+1) == 0x003C) { | |
1111 | src->current++; | |
1112 | if(*(src->current+1) == 0x003C) { | |
1113 | src->current++; /* three in a row! */ | |
1114 | newStrength = UCOL_TERTIARY; | |
1115 | } else { /* two in a row */ | |
1116 | newStrength = UCOL_SECONDARY; | |
1117 | } | |
1118 | } else { /* just one */ | |
1119 | newStrength = UCOL_PRIMARY; | |
1120 | } | |
729e4ab9 A |
1121 | if(*(src->current+1) == 0x002A) {/*'*'*/ |
1122 | src->current++; | |
1123 | src->isStarred = TRUE; | |
1124 | } | |
46f4442e | 1125 | break; |
b75a7d8f | 1126 | |
46f4442e A |
1127 | case 0x0026/*'&'*/: |
1128 | if (newStrength != UCOL_TOK_UNSET) { | |
1129 | /**/ | |
1130 | goto EndOfLoop; | |
1131 | } | |
b75a7d8f | 1132 | |
46f4442e A |
1133 | newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ |
1134 | break; | |
1135 | ||
1136 | case 0x005b/*'['*/: | |
1137 | /* options - read an option, analyze it */ | |
1138 | if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { | |
1139 | uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); | |
1140 | if(U_SUCCESS(*status)) { | |
1141 | if(result & UCOL_TOK_TOP) { | |
1142 | if(newStrength == UCOL_TOK_RESET) { | |
1143 | top = ucol_tok_doSetTop(src, status); | |
1144 | if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b' | |
1145 | src->parsedToken.charsLen+=2; | |
1146 | buff[0] = 0x002d; | |
1147 | buff[1] = before; | |
1148 | ucol_tok_addToExtraCurrent(src, buff, 2, status); | |
1149 | } | |
1150 | ||
1151 | src->current++; | |
1152 | goto EndOfLoop; | |
1153 | } else { | |
1154 | *status = U_INVALID_FORMAT_ERROR; | |
1155 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
729e4ab9 | 1156 | DBG_FORMAT_ERROR |
46f4442e A |
1157 | } |
1158 | } else if(result & UCOL_TOK_VARIABLE_TOP) { | |
1159 | if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { | |
1160 | variableTop = TRUE; | |
1161 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
1162 | src->parsedToken.charsLen = 1; | |
1163 | buff[0] = 0xFFFF; | |
1164 | ucol_tok_addToExtraCurrent(src, buff, 1, status); | |
1165 | src->current++; | |
1166 | goto EndOfLoop; | |
1167 | } else { | |
1168 | *status = U_INVALID_FORMAT_ERROR; | |
1169 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
729e4ab9 | 1170 | DBG_FORMAT_ERROR |
46f4442e A |
1171 | } |
1172 | } else if (result & UCOL_TOK_BEFORE){ | |
1173 | if(newStrength == UCOL_TOK_RESET) { | |
1174 | before = result & UCOL_TOK_BEFORE; | |
1175 | } else { | |
1176 | *status = U_INVALID_FORMAT_ERROR; | |
1177 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
729e4ab9 | 1178 | DBG_FORMAT_ERROR |
46f4442e A |
1179 | } |
1180 | } | |
1181 | } else { | |
1182 | *status = U_INVALID_FORMAT_ERROR; | |
1183 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
729e4ab9 | 1184 | DBG_FORMAT_ERROR |
46f4442e A |
1185 | return NULL; |
1186 | } | |
1187 | } | |
1188 | break; | |
1189 | case 0x0021/*! skip java thai modifier reordering*/: | |
1190 | break; | |
1191 | case 0x002F/*'/'*/: | |
1192 | wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ | |
1193 | inChars = FALSE; /* we're now processing expansion */ | |
1194 | break; | |
1195 | case 0x005C /* back slash for escaped chars */: | |
1196 | isEscaped = TRUE; | |
1197 | break; | |
1198 | /* found a quote, we're gonna start copying */ | |
1199 | case 0x0027/*'\''*/: | |
1200 | if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ | |
729e4ab9 A |
1201 | *status = U_INVALID_FORMAT_ERROR; |
1202 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
1203 | DBG_FORMAT_ERROR | |
1204 | return NULL; | |
1205 | // enabling rules to start with a non-token character a < b | |
1206 | // newStrength = UCOL_TOK_RESET; | |
46f4442e | 1207 | } |
b75a7d8f | 1208 | |
46f4442e A |
1209 | inQuote = TRUE; |
1210 | ||
1211 | if(inChars) { /* we're doing characters */ | |
1212 | if(wasInQuote == FALSE) { | |
1213 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
1214 | } | |
1215 | if (src->parsedToken.charsLen != 0) { | |
1216 | ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); | |
1217 | } | |
1218 | src->parsedToken.charsLen++; | |
1219 | } else { /* we're doing an expansion */ | |
1220 | if(wasInQuote == FALSE) { | |
1221 | extensionOffset = (uint32_t)(src->extraCurrent - src->source); | |
1222 | } | |
1223 | if (newExtensionLen != 0) { | |
1224 | ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); | |
1225 | } | |
1226 | newExtensionLen++; | |
1227 | } | |
b75a7d8f | 1228 | |
46f4442e | 1229 | wasInQuote = TRUE; |
b75a7d8f | 1230 | |
46f4442e A |
1231 | ch = *(++(src->current)); |
1232 | if(ch == 0x0027) { /* copy the double quote */ | |
1233 | ucol_tok_addToExtraCurrent(src, &ch, 1, status); | |
1234 | inQuote = FALSE; | |
1235 | } | |
1236 | break; | |
1237 | ||
1238 | /* '@' is french only if the strength is not currently set */ | |
1239 | /* if it is, it's just a regular character in collation rules */ | |
1240 | case 0x0040/*'@'*/: | |
1241 | if (newStrength == UCOL_TOK_UNSET) { | |
1242 | src->opts->frenchCollation = UCOL_ON; | |
1243 | break; | |
1244 | } | |
b75a7d8f | 1245 | |
46f4442e A |
1246 | case 0x007C /*|*/: /* this means we have actually been reading prefix part */ |
1247 | // we want to store read characters to the prefix part and continue reading | |
1248 | // the characters (proper way would be to restart reading the chars, but in | |
1249 | // that case we would have to complicate the token hasher, which I do not | |
1250 | // intend to play with. Instead, we will do prefixes when prefixes are due | |
1251 | // (before adding the elements). | |
1252 | src->parsedToken.prefixOffset = src->parsedToken.charsOffset; | |
1253 | src->parsedToken.prefixLen = src->parsedToken.charsLen; | |
1254 | ||
1255 | if(inChars) { /* we're doing characters */ | |
1256 | if(wasInQuote == FALSE) { | |
1257 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
1258 | } | |
1259 | if (src->parsedToken.charsLen != 0) { | |
1260 | ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); | |
1261 | } | |
1262 | src->parsedToken.charsLen++; | |
1263 | } | |
b75a7d8f | 1264 | |
46f4442e A |
1265 | wasInQuote = TRUE; |
1266 | ||
1267 | do { | |
1268 | ch = *(++(src->current)); | |
1269 | // skip whitespace between '|' and the character | |
1270 | } while (uprv_isRuleWhiteSpace(ch)); | |
1271 | break; | |
1272 | ||
1273 | //charsOffset = 0; | |
1274 | //newCharsLen = 0; | |
1275 | //break; // We want to store the whole prefix/character sequence. If we break | |
1276 | // the '|' is going to get lost. | |
729e4ab9 A |
1277 | |
1278 | case 0x002D /*-*/: /* A range. */ | |
1279 | if (newStrength != UCOL_TOK_UNSET) { | |
1280 | // While processing the pending token, the isStarred field | |
1281 | // is reset, so it needs to be saved for the next | |
1282 | // invocation. | |
1283 | src->savedIsStarred = src->isStarred; | |
1284 | goto EndOfLoop; | |
1285 | } | |
1286 | src->isStarred = src->savedIsStarred; | |
1287 | ||
1288 | // Ranges are valid only in starred tokens. | |
1289 | if (!src->isStarred) { | |
1290 | *status = U_INVALID_FORMAT_ERROR; | |
1291 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
1292 | DBG_FORMAT_ERROR | |
1293 | return NULL; | |
1294 | } | |
1295 | newStrength = src->parsedToken.strength; | |
1296 | src->inRange = TRUE; | |
1297 | break; | |
1298 | ||
46f4442e A |
1299 | case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ |
1300 | do { | |
1301 | ch = *(++(src->current)); | |
1302 | } while (!isCharNewLine(ch)); | |
1303 | ||
1304 | break; | |
1305 | default: | |
1306 | if (newStrength == UCOL_TOK_UNSET) { | |
729e4ab9 A |
1307 | *status = U_INVALID_FORMAT_ERROR; |
1308 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
1309 | DBG_FORMAT_ERROR | |
1310 | return NULL; | |
46f4442e | 1311 | } |
374ca955 | 1312 | |
46f4442e A |
1313 | if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { |
1314 | *status = U_INVALID_FORMAT_ERROR; | |
1315 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
729e4ab9 | 1316 | DBG_FORMAT_ERROR |
46f4442e A |
1317 | return NULL; |
1318 | } | |
374ca955 | 1319 | |
46f4442e A |
1320 | if(ch == 0x0000 && src->current+1 == src->end) { |
1321 | break; | |
1322 | } | |
b75a7d8f | 1323 | |
46f4442e A |
1324 | if (inChars) { |
1325 | if(src->parsedToken.charsLen == 0) { | |
1326 | src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); | |
1327 | } | |
1328 | src->parsedToken.charsLen++; | |
1329 | } else { | |
1330 | if(newExtensionLen == 0) { | |
1331 | extensionOffset = (uint32_t)(src->current - src->source); | |
1332 | } | |
1333 | newExtensionLen++; | |
1334 | } | |
b75a7d8f | 1335 | |
46f4442e A |
1336 | break; |
1337 | } | |
b75a7d8f | 1338 | } |
46f4442e | 1339 | } |
b75a7d8f | 1340 | |
46f4442e A |
1341 | if(wasInQuote) { |
1342 | if(ch != 0x27) { | |
1343 | if(inQuote || !uprv_isRuleWhiteSpace(ch)) { | |
1344 | ucol_tok_addToExtraCurrent(src, &ch, 1, status); | |
1345 | } | |
b75a7d8f | 1346 | } |
46f4442e | 1347 | } |
b75a7d8f | 1348 | |
46f4442e | 1349 | src->current++; |
b75a7d8f A |
1350 | } |
1351 | ||
46f4442e A |
1352 | EndOfLoop: |
1353 | wasInQuote = FALSE; | |
1354 | if (newStrength == UCOL_TOK_UNSET) { | |
1355 | return NULL; | |
b75a7d8f A |
1356 | } |
1357 | ||
46f4442e A |
1358 | if (src->parsedToken.charsLen == 0 && top == FALSE) { |
1359 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
1360 | *status = U_INVALID_FORMAT_ERROR; | |
729e4ab9 | 1361 | DBG_FORMAT_ERROR |
46f4442e | 1362 | return NULL; |
b75a7d8f A |
1363 | } |
1364 | ||
46f4442e A |
1365 | src->parsedToken.strength = newStrength; |
1366 | src->parsedToken.extensionOffset = extensionOffset; | |
1367 | src->parsedToken.extensionLen = newExtensionLen; | |
1368 | src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; | |
b75a7d8f | 1369 | |
46f4442e | 1370 | return src->current; |
b75a7d8f A |
1371 | } |
1372 | ||
729e4ab9 A |
1373 | /* |
1374 | * Parses the next token, keeps the indices in src->parsedToken, and updates the counters. | |
1375 | * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported. | |
1376 | * | |
1377 | * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following: | |
1378 | * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates | |
1379 | * it to separate tokens and returns one by one. In order to do that, the necessary states are | |
1380 | * cached as member variables of the token parser. | |
1381 | * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the | |
1382 | * starting character as a single list token (which is separated into individual characters here) | |
1383 | * and as another list token starting with the last character in the range. Before expanding it | |
1384 | * as a list of tokens, this function expands the range by filling the intermediate characters and | |
1385 | * returns them one by one as separate tokens. | |
1386 | * Necessary checks are done for invalid combinations. | |
1387 | */ | |
1388 | U_CAPI const UChar* U_EXPORT2 | |
1389 | ucol_tok_parseNextToken(UColTokenParser *src, | |
1390 | UBool startOfRules, | |
1391 | UParseError *parseError, | |
1392 | UErrorCode *status) | |
1393 | { | |
1394 | const UChar *nextToken; | |
1395 | ||
1396 | if (src->inRange) { | |
1397 | // We are not done processing a range. Continue it. | |
1398 | return ucol_tok_processNextCodePointInRange(src, status); | |
1399 | } else if (src->isStarred) { | |
1400 | // We are not done processing a starred token. Continue it. | |
1401 | return ucol_tok_processNextTokenInStarredList(src); | |
1402 | } | |
1403 | ||
1404 | // Get the next token. | |
1405 | nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status); | |
1406 | ||
1407 | if (nextToken == NULL) { | |
1408 | return NULL; | |
1409 | } | |
1410 | ||
1411 | if (src->inRange) { | |
1412 | // A new range has started. | |
1413 | // Check whether it is a chain of ranges with more than one hyphen. | |
1414 | if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { | |
1415 | *status = U_INVALID_FORMAT_ERROR; | |
1416 | syntaxError(src->source,src->parsedToken.charsOffset-1, | |
1417 | src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError); | |
1418 | DBG_FORMAT_ERROR | |
1419 | return NULL; | |
1420 | } | |
1421 | ||
1422 | // The current token indicates the second code point of the range. | |
1423 | // Process just that, and then proceed with the star. | |
1424 | src->currentStarredCharIndex = src->parsedToken.charsOffset; | |
1425 | U16_NEXT(src->source, src->currentStarredCharIndex, | |
1426 | (uint32_t)(src->end - src->source), src->lastRangeCp); | |
1427 | if (src->lastRangeCp <= src->previousCp) { | |
1428 | *status = U_INVALID_FORMAT_ERROR; | |
1429 | syntaxError(src->source,src->parsedToken.charsOffset-1, | |
1430 | src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); | |
1431 | DBG_FORMAT_ERROR | |
1432 | return NULL; | |
1433 | } | |
1434 | ||
1435 | // Set current range code point to process the range loop | |
1436 | src->currentRangeCp = src->previousCp + 1; | |
1437 | ||
1438 | src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; | |
1439 | ||
1440 | return ucol_tok_processNextCodePointInRange(src, status); | |
1441 | } else if (src->isStarred) { | |
1442 | // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that | |
1443 | // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be | |
1444 | // separated into several tokens and returned. | |
1445 | src->currentStarredCharIndex = src->parsedToken.charsOffset; | |
1446 | src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; | |
1447 | ||
1448 | return ucol_tok_processNextTokenInStarredList(src); | |
1449 | } else { | |
1450 | // Set previous codepoint | |
1451 | U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp); | |
1452 | } | |
1453 | return nextToken; | |
1454 | } | |
1455 | ||
1456 | ||
b75a7d8f A |
1457 | /* |
1458 | Processing Description | |
46f4442e A |
1459 | 1 Build a ListList. Each list has a header, which contains two lists (positive |
1460 | and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and | |
1461 | reset may be null. | |
1462 | 2 As you process, you keep a LAST pointer that points to the last token you | |
1463 | handled. | |
729e4ab9 | 1464 | |
b75a7d8f A |
1465 | */ |
1466 | ||
729e4ab9 | 1467 | static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext, |
46f4442e A |
1468 | UParseError *parseError, UErrorCode *status) |
1469 | { | |
1470 | if(src->resultLen == src->listCapacity) { | |
1471 | // Unfortunately, this won't work, as we store addresses of lhs in token | |
1472 | src->listCapacity *= 2; | |
1473 | src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader)); | |
1474 | if(src->lh == NULL) { | |
1475 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1476 | return NULL; | |
1477 | } | |
b75a7d8f | 1478 | } |
46f4442e A |
1479 | /* do the reset thing */ |
1480 | UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); | |
1481 | /* test for NULL */ | |
1482 | if (sourceToken == NULL) { | |
1483 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1484 | return NULL; | |
1485 | } | |
729e4ab9 | 1486 | sourceToken->rulesToParseHdl = &(src->source); |
46f4442e A |
1487 | sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
1488 | sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; | |
1489 | ||
1490 | sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); | |
1491 | sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); | |
1492 | ||
1493 | // keep the flags around so that we know about before | |
1494 | sourceToken->flags = src->parsedToken.flags; | |
1495 | ||
1496 | if(src->parsedToken.prefixOffset != 0) { | |
1497 | // this is a syntax error | |
1498 | *status = U_INVALID_FORMAT_ERROR; | |
1499 | syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); | |
729e4ab9 | 1500 | DBG_FORMAT_ERROR |
46f4442e A |
1501 | uprv_free(sourceToken); |
1502 | return 0; | |
b75a7d8f | 1503 | } else { |
46f4442e | 1504 | sourceToken->prefix = 0; |
b75a7d8f | 1505 | } |
b75a7d8f | 1506 | |
46f4442e A |
1507 | sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |
1508 | sourceToken->strength = UCOL_TOK_RESET; | |
1509 | sourceToken->next = NULL; | |
1510 | sourceToken->previous = NULL; | |
1511 | sourceToken->noOfCEs = 0; | |
1512 | sourceToken->noOfExpCEs = 0; | |
1513 | sourceToken->listHeader = &src->lh[src->resultLen]; | |
1514 | ||
1515 | src->lh[src->resultLen].first = NULL; | |
1516 | src->lh[src->resultLen].last = NULL; | |
1517 | src->lh[src->resultLen].first = NULL; | |
1518 | src->lh[src->resultLen].last = NULL; | |
b75a7d8f | 1519 | |
46f4442e | 1520 | src->lh[src->resultLen].reset = sourceToken; |
b75a7d8f | 1521 | |
46f4442e A |
1522 | /* |
1523 | 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... | |
1524 | First convert all expansions into normal form. Examples: | |
1525 | If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * | |
1526 | d * ... into &x * c/y * d * ... | |
1527 | Note: reset values can never have expansions, although they can cause the | |
1528 | very next item to have one. They may be contractions, if they are found | |
1529 | earlier in the list. | |
1530 | */ | |
1531 | *expandNext = 0; | |
1532 | if(expand != NULL) { | |
1533 | /* check to see if there is an expansion */ | |
1534 | if(src->parsedToken.charsLen > 1) { | |
1535 | uint32_t resetCharsOffset; | |
1536 | resetCharsOffset = (uint32_t)(expand - src->source); | |
1537 | sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset; | |
1538 | *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); | |
1539 | } | |
1540 | } | |
1541 | ||
1542 | src->resultLen++; | |
1543 | ||
1544 | uhash_put(src->tailored, sourceToken, sourceToken, status); | |
1545 | ||
1546 | return sourceToken; | |
b75a7d8f A |
1547 | } |
1548 | ||
1549 | static | |
1550 | inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) { | |
46f4442e A |
1551 | if(U_FAILURE(*status)) { |
1552 | return NULL; | |
1553 | } | |
1554 | /* this is a virgin before - we need to fish the anchor from the UCA */ | |
1555 | collIterate s; | |
1556 | uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; | |
1557 | uint32_t CE, SecondCE; | |
1558 | uint32_t invPos; | |
1559 | if(sourceToken != NULL) { | |
729e4ab9 | 1560 | uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status); |
46f4442e | 1561 | } else { |
729e4ab9 A |
1562 | uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status); |
1563 | } | |
1564 | if(U_FAILURE(*status)) { | |
1565 | return NULL; | |
46f4442e A |
1566 | } |
1567 | ||
1568 | baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; | |
1569 | baseContCE = ucol_getNextCE(src->UCA, &s, status); | |
1570 | if(baseContCE == UCOL_NO_MORE_CES) { | |
1571 | baseContCE = 0; | |
1572 | } | |
1573 | ||
1574 | ||
1575 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); | |
1576 | uint32_t ch = 0; | |
1577 | uint32_t expandNext = 0; | |
1578 | UColToken key; | |
1579 | ||
1580 | if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ | |
729e4ab9 | 1581 | uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); |
46f4442e A |
1582 | uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
1583 | ch = uprv_uca_getCodePointFromRaw(raw-1); | |
1584 | uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); | |
729e4ab9 A |
1585 | CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
1586 | SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; | |
46f4442e A |
1587 | |
1588 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
1589 | *src->extraCurrent++ = 0xFFFE; | |
1590 | *src->extraCurrent++ = (UChar)ch; | |
1591 | src->parsedToken.charsLen++; | |
1592 | ||
1593 | key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; | |
729e4ab9 | 1594 | key.rulesToParseHdl = &(src->source); |
46f4442e A |
1595 | |
1596 | //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); | |
1597 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
1598 | ||
1599 | if(sourceToken == NULL) { | |
1600 | src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
1601 | if(isContinuation(SecondCE)) { | |
1602 | src->lh[src->resultLen].baseContCE = SecondCE; | |
1603 | } else { | |
1604 | src->lh[src->resultLen].baseContCE = 0; | |
1605 | } | |
1606 | src->lh[src->resultLen].nextCE = 0; | |
1607 | src->lh[src->resultLen].nextContCE = 0; | |
1608 | src->lh[src->resultLen].previousCE = 0; | |
1609 | src->lh[src->resultLen].previousContCE = 0; | |
1610 | ||
1611 | src->lh[src->resultLen].indirect = FALSE; | |
1612 | ||
1613 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); | |
1614 | } | |
1615 | ||
1616 | } else { | |
1617 | invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); | |
1618 | ||
1619 | // we got the previous CE. Now we need to see if the difference between | |
1620 | // the two CEs is really of the requested strength. | |
1621 | // if it's a bigger difference (we asked for secondary and got primary), we | |
1622 | // need to modify the CE. | |
1623 | if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) { | |
1624 | // adjust the strength | |
1625 | // now we are in the situation where our baseCE should actually be modified in | |
1626 | // order to get the CE in the right position. | |
374ca955 | 1627 | if(strength == UCOL_SECONDARY) { |
46f4442e | 1628 | CE = baseCE - 0x0200; |
374ca955 | 1629 | } else { // strength == UCOL_TERTIARY |
46f4442e A |
1630 | CE = baseCE - 0x02; |
1631 | } | |
1632 | if(baseContCE) { | |
1633 | if(strength == UCOL_SECONDARY) { | |
1634 | SecondCE = baseContCE - 0x0200; | |
1635 | } else { // strength == UCOL_TERTIARY | |
1636 | SecondCE = baseContCE - 0x02; | |
1637 | } | |
374ca955 | 1638 | } |
46f4442e | 1639 | } |
374ca955 A |
1640 | |
1641 | #if 0 | |
46f4442e A |
1642 | // the code below relies on getting a code point from the inverse table, in order to be |
1643 | // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: | |
1644 | // 1. There are many code points that have the same CE | |
1645 | // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. | |
1646 | // Also, in case when there is no equivalent strength before an element, we have to actually | |
1647 | // construct one. For example, &[before 2]a << x won't result in x << a, because the element | |
1648 | // before a is a primary difference. | |
374ca955 | 1649 | |
46f4442e | 1650 | //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
374ca955 A |
1651 | |
1652 | ||
46f4442e | 1653 | ch = CETable[3*invPos+2]; |
374ca955 | 1654 | |
46f4442e A |
1655 | if((ch & UCOL_INV_SIZEMASK) != 0) { |
1656 | uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts); | |
1657 | uint32_t offset = (ch & UCOL_INV_OFFSETMASK); | |
1658 | ch = conts[offset]; | |
1659 | } | |
374ca955 | 1660 | |
46f4442e A |
1661 | *src->extraCurrent++ = (UChar)ch; |
1662 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1); | |
1663 | src->parsedToken.charsLen = 1; | |
374ca955 | 1664 | |
46f4442e A |
1665 | // We got an UCA before. However, this might have been tailored. |
1666 | // example: | |
1667 | // &\u30ca = \u306a | |
1668 | // &[before 3]\u306a<<<\u306a|\u309d | |
374ca955 A |
1669 | |
1670 | ||
46f4442e A |
1671 | // uint32_t key = (*newCharsLen << 24) | *charsOffset; |
1672 | key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; | |
729e4ab9 | 1673 | key.rulesToParseHdl = &(src->source); |
374ca955 | 1674 | |
46f4442e A |
1675 | //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
1676 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
374ca955 A |
1677 | #endif |
1678 | ||
46f4442e A |
1679 | // here is how it should be. The situation such as &[before 1]a < x, should be |
1680 | // resolved exactly as if we wrote &a > x. | |
1681 | // therefore, I don't really care if the UCA value before a has been changed. | |
1682 | // However, I do care if the strength between my element and the previous element | |
1683 | // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll | |
1684 | // have to construct the base CE. | |
374ca955 A |
1685 | |
1686 | ||
1687 | ||
46f4442e A |
1688 | // if we found a tailored thing, we have to use the UCA value and construct |
1689 | // a new reset token with constructed name | |
1690 | //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { | |
374ca955 A |
1691 | // character to which we want to anchor is already tailored. |
1692 | // We need to construct a new token which will be the anchor | |
1693 | // point | |
1694 | //*(src->extraCurrent-1) = 0xFFFE; | |
1695 | //*src->extraCurrent++ = (UChar)ch; | |
1696 | // grab before | |
1697 | src->parsedToken.charsOffset -= 10; | |
1698 | src->parsedToken.charsLen += 10; | |
1699 | src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
1700 | if(isContinuation(SecondCE)) { | |
46f4442e | 1701 | src->lh[src->resultLen].baseContCE = SecondCE; |
374ca955 | 1702 | } else { |
46f4442e | 1703 | src->lh[src->resultLen].baseContCE = 0; |
374ca955 A |
1704 | } |
1705 | src->lh[src->resultLen].nextCE = 0; | |
1706 | src->lh[src->resultLen].nextContCE = 0; | |
1707 | src->lh[src->resultLen].previousCE = 0; | |
1708 | src->lh[src->resultLen].previousContCE = 0; | |
b75a7d8f | 1709 | |
374ca955 | 1710 | src->lh[src->resultLen].indirect = FALSE; |
b75a7d8f | 1711 | |
374ca955 | 1712 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
46f4442e A |
1713 | //} |
1714 | } | |
b75a7d8f | 1715 | |
46f4442e | 1716 | return sourceToken; |
b75a7d8f A |
1717 | |
1718 | } | |
1719 | ||
1720 | uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { | |
46f4442e A |
1721 | UColToken *lastToken = NULL; |
1722 | const UChar *parseEnd = NULL; | |
1723 | uint32_t expandNext = 0; | |
1724 | UBool variableTop = FALSE; | |
1725 | UBool top = FALSE; | |
1726 | uint16_t specs = 0; | |
1727 | UColTokListHeader *ListList = NULL; | |
b75a7d8f | 1728 | |
46f4442e | 1729 | src->parsedToken.strength = UCOL_TOK_UNSET; |
b75a7d8f | 1730 | |
46f4442e | 1731 | ListList = src->lh; |
b75a7d8f | 1732 | |
46f4442e A |
1733 | if(U_FAILURE(*status)) { |
1734 | return 0; | |
1735 | } | |
729e4ab9 A |
1736 | #ifdef DEBUG_FOR_CODE_POINTS |
1737 | char filename[35]; | |
1738 | sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); | |
1739 | dfcp_fp = fopen(filename, "a"); | |
1740 | fprintf(stdout, "Output is in the file %s.\n", filename); | |
1741 | #endif | |
1742 | ||
1743 | #ifdef DEBUG_FOR_COLL_RULES | |
1744 | std::string s3; | |
1745 | UnicodeString(src->source).toUTF8String(s3); | |
1746 | std::cout << "src->source = " << s3 << std::endl; | |
1747 | #endif | |
b75a7d8f | 1748 | |
729e4ab9 | 1749 | while(src->current < src->end || src->isStarred) { |
46f4442e | 1750 | src->parsedToken.prefixOffset = 0; |
374ca955 | 1751 | |
46f4442e A |
1752 | parseEnd = ucol_tok_parseNextToken(src, |
1753 | (UBool)(lastToken == NULL), | |
1754 | parseError, | |
1755 | status); | |
b75a7d8f | 1756 | |
46f4442e | 1757 | specs = src->parsedToken.flags; |
b75a7d8f A |
1758 | |
1759 | ||
46f4442e A |
1760 | variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); |
1761 | top = ((specs & UCOL_TOK_TOP) != 0); | |
b75a7d8f | 1762 | |
46f4442e A |
1763 | if(U_SUCCESS(*status) && parseEnd != NULL) { |
1764 | UColToken *sourceToken = NULL; | |
1765 | //uint32_t key = 0; | |
1766 | uint32_t lastStrength = UCOL_TOK_UNSET; | |
374ca955 | 1767 | |
46f4442e A |
1768 | if(lastToken != NULL ) { |
1769 | lastStrength = lastToken->strength; | |
1770 | } | |
b75a7d8f | 1771 | |
729e4ab9 A |
1772 | #ifdef DEBUG_FOR_CODE_POINTS |
1773 | UChar32 cp; | |
1774 | U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp); | |
1775 | fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength); | |
1776 | #endif | |
46f4442e A |
1777 | //key = newCharsLen << 24 | charsOffset; |
1778 | UColToken key; | |
1779 | key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; | |
729e4ab9 | 1780 | key.rulesToParseHdl = &(src->source); |
b75a7d8f | 1781 | |
46f4442e A |
1782 | /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ |
1783 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
b75a7d8f | 1784 | |
46f4442e A |
1785 | if(src->parsedToken.strength != UCOL_TOK_RESET) { |
1786 | if(lastToken == NULL) { /* this means that rules haven't started properly */ | |
1787 | *status = U_INVALID_FORMAT_ERROR; | |
1788 | syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); | |
729e4ab9 | 1789 | DBG_FORMAT_ERROR |
46f4442e A |
1790 | return 0; |
1791 | } | |
1792 | /* 6 Otherwise (when relation != reset) */ | |
1793 | if(sourceToken == NULL) { | |
1794 | /* If sourceToken is null, create new one, */ | |
1795 | sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); | |
1796 | /* test for NULL */ | |
1797 | if (sourceToken == NULL) { | |
1798 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1799 | return 0; | |
1800 | } | |
729e4ab9 | 1801 | sourceToken->rulesToParseHdl = &(src->source); |
46f4442e A |
1802 | sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
1803 | ||
1804 | sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); | |
1805 | ||
1806 | sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; | |
1807 | sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); | |
1808 | ||
1809 | sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ | |
1810 | sourceToken->next = NULL; | |
1811 | sourceToken->previous = NULL; | |
1812 | sourceToken->noOfCEs = 0; | |
1813 | sourceToken->noOfExpCEs = 0; | |
1814 | // keep the flags around so that we know about before | |
1815 | sourceToken->flags = src->parsedToken.flags; | |
1816 | uhash_put(src->tailored, sourceToken, sourceToken, status); | |
1817 | if(U_FAILURE(*status)) { | |
1818 | return 0; | |
1819 | } | |
1820 | } else { | |
1821 | /* we could have fished out a reset here */ | |
1822 | if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { | |
1823 | /* otherwise remove sourceToken from where it was. */ | |
1824 | if(sourceToken->next != NULL) { | |
1825 | if(sourceToken->next->strength > sourceToken->strength) { | |
1826 | sourceToken->next->strength = sourceToken->strength; | |
1827 | } | |
1828 | sourceToken->next->previous = sourceToken->previous; | |
1829 | } else { | |
1830 | sourceToken->listHeader->last = sourceToken->previous; | |
1831 | } | |
1832 | ||
1833 | if(sourceToken->previous != NULL) { | |
1834 | sourceToken->previous->next = sourceToken->next; | |
1835 | } else { | |
1836 | sourceToken->listHeader->first = sourceToken->next; | |
1837 | } | |
1838 | sourceToken->next = NULL; | |
1839 | sourceToken->previous = NULL; | |
1840 | } | |
1841 | } | |
b75a7d8f | 1842 | |
46f4442e A |
1843 | sourceToken->strength = src->parsedToken.strength; |
1844 | sourceToken->listHeader = lastToken->listHeader; | |
1845 | ||
1846 | /* | |
1847 | 1. Find the strongest strength in each list, and set strongestP and strongestN | |
1848 | accordingly in the headers. | |
1849 | */ | |
1850 | if(lastStrength == UCOL_TOK_RESET | |
1851 | || sourceToken->listHeader->first == 0) { | |
1852 | /* If LAST is a reset | |
1853 | insert sourceToken in the list. */ | |
1854 | if(sourceToken->listHeader->first == 0) { | |
1855 | sourceToken->listHeader->first = sourceToken; | |
1856 | sourceToken->listHeader->last = sourceToken; | |
1857 | } else { /* we need to find a place for us */ | |
1858 | /* and we'll get in front of the same strength */ | |
1859 | if(sourceToken->listHeader->first->strength <= sourceToken->strength) { | |
1860 | sourceToken->next = sourceToken->listHeader->first; | |
1861 | sourceToken->next->previous = sourceToken; | |
1862 | sourceToken->listHeader->first = sourceToken; | |
1863 | sourceToken->previous = NULL; | |
1864 | } else { | |
1865 | lastToken = sourceToken->listHeader->first; | |
1866 | while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { | |
1867 | lastToken = lastToken->next; | |
1868 | } | |
1869 | if(lastToken->next != NULL) { | |
1870 | lastToken->next->previous = sourceToken; | |
1871 | } else { | |
1872 | sourceToken->listHeader->last = sourceToken; | |
1873 | } | |
1874 | sourceToken->previous = lastToken; | |
1875 | sourceToken->next = lastToken->next; | |
1876 | lastToken->next = sourceToken; | |
1877 | } | |
1878 | } | |
1879 | } else { | |
1880 | /* Otherwise (when LAST is not a reset) | |
1881 | if polarity (LAST) == polarity(relation), insert sourceToken after LAST, | |
1882 | otherwise insert before. | |
1883 | when inserting after or before, search to the next position with the same | |
1884 | strength in that direction. (This is called postpone insertion). */ | |
1885 | if(sourceToken != lastToken) { | |
1886 | if(lastToken->polarity == sourceToken->polarity) { | |
1887 | while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { | |
1888 | lastToken = lastToken->next; | |
1889 | } | |
1890 | sourceToken->previous = lastToken; | |
1891 | if(lastToken->next != NULL) { | |
1892 | lastToken->next->previous = sourceToken; | |
1893 | } else { | |
1894 | sourceToken->listHeader->last = sourceToken; | |
1895 | } | |
1896 | ||
1897 | sourceToken->next = lastToken->next; | |
1898 | lastToken->next = sourceToken; | |
1899 | } else { | |
1900 | while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { | |
1901 | lastToken = lastToken->previous; | |
1902 | } | |
1903 | sourceToken->next = lastToken; | |
1904 | if(lastToken->previous != NULL) { | |
1905 | lastToken->previous->next = sourceToken; | |
1906 | } else { | |
1907 | sourceToken->listHeader->first = sourceToken; | |
1908 | } | |
1909 | sourceToken->previous = lastToken->previous; | |
1910 | lastToken->previous = sourceToken; | |
1911 | } | |
1912 | } else { /* repeated one thing twice in rules, stay with the stronger strength */ | |
1913 | if(lastStrength < sourceToken->strength) { | |
1914 | sourceToken->strength = lastStrength; | |
1915 | } | |
1916 | } | |
1917 | } | |
b75a7d8f | 1918 | |
46f4442e A |
1919 | /* if the token was a variable top, we're gonna put it in */ |
1920 | if(variableTop == TRUE && src->varTop == NULL) { | |
1921 | variableTop = FALSE; | |
1922 | src->varTop = sourceToken; | |
1923 | } | |
1924 | ||
1925 | // Treat the expansions. | |
1926 | // There are two types of expansions: explicit (x / y) and reset based propagating expansions | |
1927 | // (&abc * d * e <=> &ab * d / c * e / c) | |
1928 | // if both of them are in effect for a token, they are combined. | |
1929 | ||
1930 | sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; | |
1931 | ||
1932 | if(expandNext != 0) { | |
1933 | if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ | |
1934 | expandNext = 0; | |
1935 | } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ | |
1936 | sourceToken->expansion = expandNext; | |
1937 | } else { /* there is both explicit and implicit expansion. We need to make a combination */ | |
1938 | uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); | |
1939 | uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); | |
1940 | sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source)); | |
1941 | src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; | |
1942 | } | |
1943 | } | |
1944 | ||
1945 | // This is just for debugging purposes | |
1946 | if(sourceToken->expansion != 0) { | |
1947 | sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); | |
1948 | } else { | |
1949 | sourceToken->debugExpansion = 0; | |
1950 | } | |
1951 | // if the previous token was a reset before, the strength of this | |
1952 | // token must match the strength of before. Otherwise we have an | |
1953 | // undefined situation. | |
1954 | // In other words, we currently have a cludge which we use to | |
1955 | // represent &a >> x. This is written as &[before 2]a << x. | |
1956 | if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { | |
1957 | uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; | |
1958 | if(beforeStrength != sourceToken->strength) { | |
1959 | *status = U_INVALID_FORMAT_ERROR; | |
1960 | syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); | |
729e4ab9 | 1961 | DBG_FORMAT_ERROR |
46f4442e A |
1962 | return 0; |
1963 | } | |
1964 | } | |
b75a7d8f | 1965 | } else { |
46f4442e A |
1966 | if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { |
1967 | /* if the previous token was also a reset, */ | |
1968 | /*this means that we have two consecutive resets */ | |
1969 | /* and we want to remove the previous one if empty*/ | |
1970 | if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { | |
1971 | src->resultLen--; | |
1972 | } | |
1973 | } | |
b75a7d8f | 1974 | |
46f4442e A |
1975 | if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ |
1976 | uint32_t searchCharsLen = src->parsedToken.charsLen; | |
1977 | while(searchCharsLen > 1 && sourceToken == NULL) { | |
1978 | searchCharsLen--; | |
1979 | //key = searchCharsLen << 24 | charsOffset; | |
1980 | UColToken key; | |
1981 | key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; | |
729e4ab9 | 1982 | key.rulesToParseHdl = &(src->source); |
46f4442e A |
1983 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
1984 | } | |
1985 | if(sourceToken != NULL) { | |
1986 | expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); | |
1987 | } | |
1988 | } | |
b75a7d8f | 1989 | |
46f4442e A |
1990 | if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ |
1991 | if(top == FALSE) { /* there is no indirection */ | |
1992 | uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; | |
1993 | if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { | |
1994 | /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ | |
1995 | while(sourceToken->strength > strength && sourceToken->previous != NULL) { | |
1996 | sourceToken = sourceToken->previous; | |
1997 | } | |
1998 | /* here, either we hit the strength or NULL */ | |
1999 | if(sourceToken->strength == strength) { | |
2000 | if(sourceToken->previous != NULL) { | |
2001 | sourceToken = sourceToken->previous; | |
2002 | } else { /* start of list */ | |
2003 | sourceToken = sourceToken->listHeader->reset; | |
2004 | } | |
2005 | } else { /* we hit NULL */ | |
2006 | /* we should be doing the else part */ | |
2007 | sourceToken = sourceToken->listHeader->reset; | |
2008 | sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); | |
2009 | } | |
2010 | } else { | |
2011 | sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); | |
2012 | } | |
2013 | } else { /* this is both before and indirection */ | |
2014 | top = FALSE; | |
2015 | ListList[src->resultLen].previousCE = 0; | |
2016 | ListList[src->resultLen].previousContCE = 0; | |
2017 | ListList[src->resultLen].indirect = TRUE; | |
2018 | /* we need to do slightly more work. we need to get the baseCE using the */ | |
2019 | /* inverse UCA & getPrevious. The next bound is not set, and will be decided */ | |
2020 | /* in ucol_bld */ | |
2021 | uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; | |
2022 | uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; | |
2023 | uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; | |
2024 | uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; | |
2025 | ||
2026 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); | |
729e4ab9 A |
2027 | if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && |
2028 | (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ | |
2029 | uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); | |
46f4442e A |
2030 | uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
2031 | uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); | |
729e4ab9 A |
2032 | CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
2033 | SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; | |
46f4442e A |
2034 | } else { |
2035 | /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ | |
2036 | ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); | |
2037 | } | |
2038 | ||
2039 | ListList[src->resultLen].baseCE = CE; | |
2040 | ListList[src->resultLen].baseContCE = SecondCE; | |
2041 | ListList[src->resultLen].nextCE = 0; | |
2042 | ListList[src->resultLen].nextContCE = 0; | |
2043 | ||
2044 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); | |
2045 | } | |
2046 | } | |
b75a7d8f | 2047 | |
b75a7d8f | 2048 | |
46f4442e A |
2049 | /* 5 If the relation is a reset: |
2050 | If sourceToken is null | |
2051 | Create new list, create new sourceToken, make the baseCE from source, put | |
2052 | the sourceToken in ListHeader of the new list */ | |
2053 | if(sourceToken == NULL) { | |
2054 | /* | |
2055 | 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... | |
2056 | First convert all expansions into normal form. Examples: | |
2057 | If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * | |
2058 | d * ... into &x * c/y * d * ... | |
2059 | Note: reset values can never have expansions, although they can cause the | |
2060 | very next item to have one. They may be contractions, if they are found | |
2061 | earlier in the list. | |
2062 | */ | |
2063 | if(top == FALSE) { | |
2064 | collIterate s; | |
2065 | uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; | |
2066 | ||
729e4ab9 | 2067 | uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status); |
46f4442e A |
2068 | |
2069 | CE = ucol_getNextCE(src->UCA, &s, status); | |
729e4ab9 | 2070 | const UChar *expand = s.pos; |
46f4442e A |
2071 | SecondCE = ucol_getNextCE(src->UCA, &s, status); |
2072 | ||
2073 | ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
2074 | if(isContinuation(SecondCE)) { | |
2075 | ListList[src->resultLen].baseContCE = SecondCE; | |
2076 | } else { | |
2077 | ListList[src->resultLen].baseContCE = 0; | |
2078 | } | |
2079 | ListList[src->resultLen].nextCE = 0; | |
2080 | ListList[src->resultLen].nextContCE = 0; | |
2081 | ListList[src->resultLen].previousCE = 0; | |
2082 | ListList[src->resultLen].previousContCE = 0; | |
2083 | ListList[src->resultLen].indirect = FALSE; | |
2084 | sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); | |
2085 | } else { /* top == TRUE */ | |
2086 | /* just use the supplied values */ | |
2087 | top = FALSE; | |
2088 | ListList[src->resultLen].previousCE = 0; | |
2089 | ListList[src->resultLen].previousContCE = 0; | |
2090 | ListList[src->resultLen].indirect = TRUE; | |
2091 | ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; | |
2092 | ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; | |
2093 | ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; | |
2094 | ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; | |
2095 | ||
2096 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); | |
b75a7d8f | 2097 | |
46f4442e A |
2098 | } |
2099 | } else { /* reset to something already in rules */ | |
2100 | top = FALSE; | |
374ca955 | 2101 | } |
b75a7d8f | 2102 | } |
46f4442e A |
2103 | /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ |
2104 | lastToken = sourceToken; | |
2105 | } else { | |
2106 | if(U_FAILURE(*status)) { | |
2107 | return 0; | |
374ca955 | 2108 | } |
b75a7d8f | 2109 | } |
46f4442e | 2110 | } |
729e4ab9 A |
2111 | #ifdef DEBUG_FOR_CODE_POINTS |
2112 | fclose(dfcp_fp); | |
2113 | #endif | |
2114 | ||
b75a7d8f | 2115 | |
46f4442e A |
2116 | if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |
2117 | src->resultLen--; | |
2118 | } | |
2119 | return src->resultLen; | |
2120 | } | |
b75a7d8f | 2121 | |
729e4ab9 A |
2122 | const UChar* ucol_tok_getRulesFromBundle( |
2123 | void* /*context*/, | |
2124 | const char* locale, | |
2125 | const char* type, | |
2126 | int32_t* pLength, | |
2127 | UErrorCode* status) | |
2128 | { | |
2129 | const UChar* rules = NULL; | |
2130 | UResourceBundle* bundle; | |
2131 | UResourceBundle* collations; | |
2132 | UResourceBundle* collation; | |
2133 | ||
2134 | *pLength = 0; | |
2135 | ||
2136 | bundle = ures_open(U_ICUDATA_COLL, locale, status); | |
2137 | if(U_SUCCESS(*status)){ | |
2138 | collations = ures_getByKey(bundle, "collations", NULL, status); | |
2139 | if(U_SUCCESS(*status)){ | |
2140 | collation = ures_getByKey(collations, type, NULL, status); | |
2141 | if(U_SUCCESS(*status)){ | |
2142 | rules = ures_getStringByKey(collation, "Sequence", pLength, status); | |
2143 | if(U_FAILURE(*status)){ | |
2144 | *pLength = 0; | |
2145 | rules = NULL; | |
2146 | } | |
2147 | ures_close(collation); | |
2148 | } | |
2149 | ures_close(collations); | |
2150 | } | |
2151 | } | |
2152 | ||
2153 | ures_close(bundle); | |
2154 | ||
2155 | return rules; | |
2156 | } | |
2157 | ||
2158 | void ucol_tok_initTokenList( | |
2159 | UColTokenParser *src, | |
2160 | const UChar *rules, | |
2161 | uint32_t rulesLength, | |
2162 | const UCollator *UCA, | |
2163 | GetCollationRulesFunction importFunc, | |
2164 | void* context, | |
2165 | UErrorCode *status) { | |
46f4442e | 2166 | U_NAMESPACE_USE |
b75a7d8f | 2167 | |
46f4442e A |
2168 | uint32_t nSize = 0; |
2169 | uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); | |
729e4ab9 A |
2170 | |
2171 | bool needToDeallocRules = false; | |
2172 | ||
46f4442e A |
2173 | if(U_FAILURE(*status)) { |
2174 | return; | |
2175 | } | |
b75a7d8f | 2176 | |
46f4442e A |
2177 | // set everything to zero, so that we can clean up gracefully |
2178 | uprv_memset(src, 0, sizeof(UColTokenParser)); | |
2179 | ||
2180 | // first we need to find options that don't like to be normalized, | |
2181 | // like copy and remove... | |
2182 | //const UChar *openBrace = rules; | |
2183 | int32_t optionNumber = -1; | |
729e4ab9 | 2184 | const UChar *setStart = NULL; |
46f4442e A |
2185 | uint32_t i = 0; |
2186 | while(i < rulesLength) { | |
729e4ab9 A |
2187 | if(rules[i] == 0x005B) { // '[': start of an option |
2188 | /* Gets the following: | |
2189 | optionNumber: The index of the option. | |
2190 | setStart: The pointer at which the option arguments start. | |
2191 | */ | |
46f4442e | 2192 | optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); |
729e4ab9 | 2193 | |
46f4442e | 2194 | if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ |
729e4ab9 | 2195 | // [optimize] |
46f4442e A |
2196 | USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); |
2197 | if(U_SUCCESS(*status)) { | |
2198 | if(src->copySet == NULL) { | |
2199 | src->copySet = newSet; | |
2200 | } else { | |
2201 | uset_addAll(src->copySet, newSet); | |
2202 | uset_close(newSet); | |
2203 | } | |
2204 | } else { | |
2205 | return; | |
2206 | } | |
2207 | } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { | |
2208 | USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); | |
2209 | if(U_SUCCESS(*status)) { | |
2210 | if(src->removeSet == NULL) { | |
2211 | src->removeSet = newSet; | |
2212 | } else { | |
2213 | uset_addAll(src->removeSet, newSet); | |
2214 | uset_close(newSet); | |
2215 | } | |
2216 | } else { | |
2217 | return; | |
2218 | } | |
729e4ab9 A |
2219 | } else if(optionNumber == OPTION_IMPORT){ |
2220 | // [import <collation-name>] | |
2221 | ||
2222 | // Find the address of the closing ]. | |
2223 | UChar* import_end = u_strchr(setStart, 0x005D); | |
2224 | int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); | |
2225 | // Ignore trailing whitespace. | |
2226 | while(uprv_isRuleWhiteSpace(*(import_end-1))) { | |
2227 | --import_end; | |
2228 | } | |
2229 | ||
2230 | int32_t optionLength = (int32_t)(import_end - setStart); | |
2231 | char option[50]; | |
2232 | if(optionLength >= (int32_t)sizeof(option)) { | |
2233 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
2234 | return; | |
2235 | } | |
2236 | u_UCharsToChars(setStart, option, optionLength); | |
2237 | option[optionLength] = 0; | |
2238 | ||
2239 | *status = U_ZERO_ERROR; | |
2240 | char locale[50]; | |
2241 | int32_t templ; | |
2242 | uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status); | |
2243 | if(U_FAILURE(*status)) { | |
2244 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
2245 | return; | |
2246 | } | |
2247 | ||
2248 | char type[50]; | |
2249 | if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 || | |
2250 | U_FAILURE(*status) | |
2251 | ) { | |
2252 | *status = U_ZERO_ERROR; | |
2253 | uprv_strcpy(type, "standard"); | |
2254 | } | |
2255 | ||
2256 | // TODO: Use public functions when available, see ticket #8134. | |
2257 | char *keywords = (char *)locale_getKeywordsStart(locale); | |
2258 | if(keywords != NULL) { | |
2259 | *keywords = 0; | |
2260 | } | |
2261 | ||
2262 | int32_t importRulesLength = 0; | |
2263 | const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status); | |
2264 | ||
2265 | #ifdef DEBUG_FOR_COLL_RULES | |
2266 | std::string s; | |
2267 | UnicodeString(importRules).toUTF8String(s); | |
2268 | std::cout << "Import rules = " << s << std::endl; | |
2269 | #endif | |
2270 | ||
2271 | // Add the length of the imported rules to length of the original rules, | |
2272 | // and subtract the length of the import option. | |
2273 | uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i); | |
2274 | ||
2275 | UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar)); | |
2276 | ||
2277 | #ifdef DEBUG_FOR_COLL_RULES | |
2278 | std::string s1; | |
2279 | UnicodeString(rules).toUTF8String(s1); | |
2280 | std::cout << "Original rules = " << s1 << std::endl; | |
2281 | #endif | |
2282 | ||
2283 | ||
2284 | // Copy the section of the original rules leading up to the import | |
2285 | uprv_memcpy(newRules, rules, i*sizeof(UChar)); | |
2286 | // Copy the imported rules | |
2287 | uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar)); | |
2288 | // Copy the rest of the original rules (minus the import option itself) | |
2289 | uprv_memcpy(newRules+i+importRulesLength, | |
2290 | rules+optionEndOffset, | |
2291 | (rulesLength-optionEndOffset)*sizeof(UChar)); | |
2292 | ||
2293 | #ifdef DEBUG_FOR_COLL_RULES | |
2294 | std::string s2; | |
2295 | UnicodeString(newRules).toUTF8String(s2); | |
2296 | std::cout << "Resulting rules = " << s2 << std::endl; | |
2297 | #endif | |
2298 | ||
2299 | if(needToDeallocRules){ | |
2300 | // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free | |
2301 | uprv_free((void*)rules); | |
2302 | } | |
2303 | needToDeallocRules = true; | |
2304 | rules = newRules; | |
2305 | rulesLength = newRulesLength; | |
2306 | ||
2307 | estimatedSize += importRulesLength*2; | |
2308 | ||
2309 | // First character of the new rules needs to be processed | |
2310 | i--; | |
46f4442e | 2311 | } |
b75a7d8f | 2312 | } |
46f4442e A |
2313 | //openBrace++; |
2314 | i++; | |
b75a7d8f | 2315 | } |
b75a7d8f | 2316 | |
46f4442e A |
2317 | src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); |
2318 | /* test for NULL */ | |
2319 | if (src->source == NULL) { | |
2320 | *status = U_MEMORY_ALLOCATION_ERROR; | |
2321 | return; | |
2322 | } | |
2323 | uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); | |
2324 | nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); | |
2325 | if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { | |
2326 | *status = U_ZERO_ERROR; | |
2327 | src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); | |
2328 | /* test for NULL */ | |
2329 | if (src->source == NULL) { | |
2330 | *status = U_MEMORY_ALLOCATION_ERROR; | |
2331 | return; | |
b75a7d8f | 2332 | } |
46f4442e A |
2333 | nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); |
2334 | } | |
729e4ab9 A |
2335 | if(needToDeallocRules){ |
2336 | // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free | |
2337 | uprv_free((void*)rules); | |
2338 | } | |
2339 | ||
2340 | ||
46f4442e A |
2341 | src->current = src->source; |
2342 | src->end = src->source+nSize; | |
2343 | src->sourceCurrent = src->source; | |
2344 | src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly | |
2345 | src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; | |
2346 | src->varTop = NULL; | |
2347 | src->UCA = UCA; | |
2348 | src->invUCA = ucol_initInverseUCA(status); | |
2349 | src->parsedToken.charsLen = 0; | |
2350 | src->parsedToken.charsOffset = 0; | |
2351 | src->parsedToken.extensionLen = 0; | |
2352 | src->parsedToken.extensionOffset = 0; | |
2353 | src->parsedToken.prefixLen = 0; | |
2354 | src->parsedToken.prefixOffset = 0; | |
2355 | src->parsedToken.flags = 0; | |
2356 | src->parsedToken.strength = UCOL_TOK_UNSET; | |
2357 | src->buildCCTabFlag = FALSE; | |
729e4ab9 A |
2358 | src->isStarred = FALSE; |
2359 | src->inRange = FALSE; | |
2360 | src->lastRangeCp = 0; | |
2361 | src->previousCp = 0; | |
46f4442e A |
2362 | |
2363 | if(U_FAILURE(*status)) { | |
2364 | return; | |
b75a7d8f | 2365 | } |
46f4442e A |
2366 | src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status); |
2367 | if(U_FAILURE(*status)) { | |
2368 | return; | |
2369 | } | |
2370 | uhash_setValueDeleter(src->tailored, uhash_freeBlock); | |
2371 | ||
2372 | src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); | |
b75a7d8f | 2373 | /* test for NULL */ |
46f4442e | 2374 | if (src->opts == NULL) { |
b75a7d8f A |
2375 | *status = U_MEMORY_ALLOCATION_ERROR; |
2376 | return; | |
2377 | } | |
46f4442e A |
2378 | |
2379 | uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); | |
2380 | ||
46f4442e A |
2381 | src->lh = 0; |
2382 | src->listCapacity = 1024; | |
2383 | src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); | |
2384 | //Test for NULL | |
2385 | if (src->lh == NULL) { | |
2386 | *status = U_MEMORY_ALLOCATION_ERROR; | |
2387 | return; | |
2388 | } | |
2389 | uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); | |
2390 | src->resultLen = 0; | |
2391 | ||
2392 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); | |
2393 | ||
2394 | // UCOL_RESET_TOP_VALUE | |
2395 | setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); | |
2396 | // UCOL_FIRST_PRIMARY_IGNORABLE | |
2397 | setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); | |
2398 | // UCOL_LAST_PRIMARY_IGNORABLE | |
2399 | setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); | |
2400 | // UCOL_FIRST_SECONDARY_IGNORABLE | |
2401 | setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); | |
2402 | // UCOL_LAST_SECONDARY_IGNORABLE | |
2403 | setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); | |
2404 | // UCOL_FIRST_TERTIARY_IGNORABLE | |
2405 | setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); | |
2406 | // UCOL_LAST_TERTIARY_IGNORABLE | |
2407 | setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); | |
2408 | // UCOL_FIRST_VARIABLE | |
2409 | setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); | |
2410 | // UCOL_LAST_VARIABLE | |
2411 | setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); | |
2412 | // UCOL_FIRST_NON_VARIABLE | |
2413 | setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); | |
2414 | // UCOL_LAST_NON_VARIABLE | |
2415 | setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); | |
2416 | // UCOL_FIRST_IMPLICIT | |
2417 | setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); | |
2418 | // UCOL_LAST_IMPLICIT | |
2419 | setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); | |
2420 | // UCOL_FIRST_TRAILING | |
2421 | setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); | |
2422 | // UCOL_LAST_TRAILING | |
2423 | setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); | |
2424 | ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); | |
b75a7d8f A |
2425 | } |
2426 | ||
2427 | ||
2428 | void ucol_tok_closeTokenList(UColTokenParser *src) { | |
46f4442e A |
2429 | if(src->copySet != NULL) { |
2430 | uset_close(src->copySet); | |
2431 | } | |
2432 | if(src->removeSet != NULL) { | |
2433 | uset_close(src->removeSet); | |
2434 | } | |
2435 | if(src->tailored != NULL) { | |
2436 | uhash_close(src->tailored); | |
2437 | } | |
2438 | if(src->lh != NULL) { | |
2439 | uprv_free(src->lh); | |
2440 | } | |
2441 | if(src->source != NULL) { | |
2442 | uprv_free(src->source); | |
2443 | } | |
2444 | if(src->opts != NULL) { | |
2445 | uprv_free(src->opts); | |
2446 | } | |
729e4ab9 A |
2447 | if (src->reorderCodes != NULL) { |
2448 | uprv_free(src->reorderCodes); | |
2449 | } | |
b75a7d8f A |
2450 | } |
2451 | ||
2452 | #endif /* #if !UCONFIG_NO_COLLATION */ |