]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
374ca955 | 4 | * Copyright (C) 2001-2004, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: ucol_tok.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created 02/22/2001 | |
14 | * created by: Vladimir Weinstein | |
15 | * | |
374ca955 | 16 | * This module reads a tailoring rule string and produces a list of |
b75a7d8f | 17 | * tokens that will be turned into collation elements |
374ca955 | 18 | * |
b75a7d8f A |
19 | */ |
20 | ||
21 | #include "unicode/utypes.h" | |
22 | ||
23 | #if !UCONFIG_NO_COLLATION | |
24 | ||
25 | #include "unicode/ustring.h" | |
26 | #include "unicode/uchar.h" | |
27 | #include "unicode/uniset.h" | |
374ca955 | 28 | |
b75a7d8f A |
29 | #include "ucol_tok.h" |
30 | #include "cmemory.h" | |
374ca955 | 31 | #include "util.h" |
b75a7d8f A |
32 | |
33 | U_CDECL_BEGIN | |
34 | static int32_t U_EXPORT2 U_CALLCONV | |
35 | uhash_hashTokens(const UHashTok k) | |
36 | { | |
37 | int32_t hash = 0; | |
38 | //uint32_t key = (uint32_t)k.integer; | |
39 | UColToken *key = (UColToken *)k.pointer; | |
40 | if (key != 0) { | |
41 | //int32_t len = (key & 0xFF000000)>>24; | |
42 | int32_t len = (key->source & 0xFF000000)>>24; | |
43 | int32_t inc = ((len - 32) / 32) + 1; | |
374ca955 | 44 | |
b75a7d8f A |
45 | //const UChar *p = (key & 0x00FFFFFF) + rulesToParse; |
46 | const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse; | |
374ca955 | 47 | const UChar *limit = p + len; |
b75a7d8f A |
48 | |
49 | while (p<limit) { | |
50 | hash = (hash * 37) + *p; | |
51 | p += inc; | |
52 | } | |
53 | } | |
54 | return hash; | |
55 | } | |
56 | ||
57 | static UBool U_EXPORT2 U_CALLCONV | |
58 | uhash_compareTokens(const UHashTok key1, const UHashTok key2) | |
59 | { | |
60 | //uint32_t p1 = (uint32_t) key1.integer; | |
61 | //uint32_t p2 = (uint32_t) key2.integer; | |
62 | UColToken *p1 = (UColToken *)key1.pointer; | |
63 | UColToken *p2 = (UColToken *)key2.pointer; | |
64 | const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse; | |
65 | const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse; | |
66 | uint32_t s1L = ((p1->source & 0xFF000000) >> 24); | |
67 | uint32_t s2L = ((p2->source & 0xFF000000) >> 24); | |
68 | const UChar *end = s1+s1L-1; | |
69 | ||
70 | if (p1 == p2) { | |
71 | return TRUE; | |
72 | } | |
73 | if (p1->source == 0 || p2->source == 0) { | |
74 | return FALSE; | |
75 | } | |
76 | if(s1L != s2L) { | |
77 | return FALSE; | |
78 | } | |
79 | if(p1->source == p2->source) { | |
80 | return TRUE; | |
81 | } | |
82 | while((s1 < end) && *s1 == *s2) { | |
83 | ++s1; | |
84 | ++s2; | |
85 | } | |
86 | if(*s1 == *s2) { | |
87 | return TRUE; | |
88 | } else { | |
89 | return FALSE; | |
90 | } | |
91 | } | |
92 | U_CDECL_END | |
93 | ||
94 | static inline void U_CALLCONV | |
95 | uhash_freeBlockWrapper(void *obj) { | |
96 | uhash_freeBlock(obj); | |
97 | } | |
98 | ||
99 | ||
100 | typedef struct { | |
101 | uint32_t startCE; | |
102 | uint32_t startContCE; | |
103 | uint32_t limitCE; | |
104 | uint32_t limitContCE; | |
105 | } indirectBoundaries; | |
106 | ||
107 | /* these values are used for finding CE values for indirect positioning. */ | |
108 | /* Indirect positioning is a mechanism for allowing resets on symbolic */ | |
109 | /* values. It only works for resets and you cannot tailor indirect names */ | |
110 | /* An indirect name can define either an anchor point or a range. An */ | |
111 | /* anchor point behaves in exactly the same way as a code point in reset */ | |
112 | /* would, except that it cannot be tailored. A range (we currently only */ | |
113 | /* know of the [top] range) will explicitly set the upper bound for */ | |
114 | /* generated CEs, thus allowing for better control over how many CEs can */ | |
115 | /* be squeezed between in the range without performance penalty. */ | |
116 | /* In that respect, we use [top] for tailoring of locales that use CJK */ | |
117 | /* characters. Other indirect values are currently a pure convenience, */ | |
118 | /* they can be used to assure that the CEs will be always positioned in */ | |
119 | /* the same place relative to a point with known properties (e.g. first */ | |
120 | /* primary ignorable). */ | |
121 | static indirectBoundaries ucolIndirectBoundaries[15]; | |
122 | /* | |
123 | static indirectBoundaries ucolIndirectBoundaries[11] = { | |
374ca955 | 124 | { UCOL_RESET_TOP_VALUE, 0, |
b75a7d8f | 125 | UCOL_NEXT_TOP_VALUE, 0 }, |
374ca955 | 126 | { UCOL_FIRST_PRIMARY_IGNORABLE, 0, |
b75a7d8f | 127 | 0, 0 }, |
374ca955 | 128 | { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, |
b75a7d8f | 129 | 0, 0 }, |
374ca955 | 130 | { UCOL_FIRST_SECONDARY_IGNORABLE, 0, |
b75a7d8f | 131 | 0, 0 }, |
374ca955 | 132 | { UCOL_LAST_SECONDARY_IGNORABLE, 0, |
b75a7d8f | 133 | 0, 0 }, |
374ca955 | 134 | { UCOL_FIRST_TERTIARY_IGNORABLE, 0, |
b75a7d8f | 135 | 0, 0 }, |
374ca955 | 136 | { UCOL_LAST_TERTIARY_IGNORABLE, 0, |
b75a7d8f | 137 | 0, 0 }, |
374ca955 | 138 | { UCOL_FIRST_VARIABLE, 0, |
b75a7d8f | 139 | 0, 0 }, |
374ca955 | 140 | { UCOL_LAST_VARIABLE, 0, |
b75a7d8f | 141 | 0, 0 }, |
374ca955 | 142 | { UCOL_FIRST_NON_VARIABLE, 0, |
b75a7d8f | 143 | 0, 0 }, |
374ca955 | 144 | { UCOL_LAST_NON_VARIABLE, 0, |
b75a7d8f A |
145 | 0, 0 }, |
146 | }; | |
147 | */ | |
148 | ||
374ca955 A |
149 | static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) { |
150 | ||
b75a7d8f A |
151 | // Set values for the top - TODO: once we have values for all the indirects, we are going |
152 | // to initalize here. | |
153 | ucolIndirectBoundaries[indexR].startCE = start[0]; | |
154 | ucolIndirectBoundaries[indexR].startContCE = start[1]; | |
155 | if(end) { | |
156 | ucolIndirectBoundaries[indexR].limitCE = end[0]; | |
157 | ucolIndirectBoundaries[indexR].limitContCE = end[1]; | |
158 | } else { | |
159 | ucolIndirectBoundaries[indexR].limitCE = 0; | |
160 | ucolIndirectBoundaries[indexR].limitContCE = 0; | |
161 | } | |
162 | } | |
163 | ||
164 | ||
374ca955 A |
165 | static inline |
166 | void syntaxError(const UChar* rules, | |
b75a7d8f A |
167 | int32_t pos, |
168 | int32_t rulesLen, | |
169 | UParseError* parseError) { | |
170 | parseError->offset = pos; | |
171 | parseError->line = 0 ; /* we are not using line numbers */ | |
374ca955 | 172 | |
b75a7d8f A |
173 | // for pre-context |
174 | int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); | |
175 | int32_t stop = pos; | |
374ca955 | 176 | |
b75a7d8f A |
177 | u_memcpy(parseError->preContext,rules+start,stop-start); |
178 | //null terminate the buffer | |
179 | parseError->preContext[stop-start] = 0; | |
374ca955 | 180 | |
b75a7d8f A |
181 | //for post-context |
182 | start = pos+1; | |
374ca955 A |
183 | stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : |
184 | rulesLen; | |
b75a7d8f | 185 | |
374ca955 A |
186 | if(start < stop) { |
187 | u_memcpy(parseError->postContext,rules+start,stop-start); | |
188 | //null terminate the buffer | |
189 | parseError->postContext[stop-start]= 0; | |
190 | } else { | |
191 | parseError->postContext[0] = 0; | |
192 | } | |
b75a7d8f A |
193 | } |
194 | ||
195 | static | |
196 | void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { | |
197 | switch(attrib) { | |
198 | case UCOL_HIRAGANA_QUATERNARY_MODE: | |
199 | opts->hiraganaQ = value; | |
200 | break; | |
201 | case UCOL_FRENCH_COLLATION: | |
202 | opts->frenchCollation = value; | |
203 | break; | |
204 | case UCOL_ALTERNATE_HANDLING: | |
205 | opts->alternateHandling = value; | |
206 | break; | |
207 | case UCOL_CASE_FIRST: | |
208 | opts->caseFirst = value; | |
209 | break; | |
210 | case UCOL_CASE_LEVEL: | |
211 | opts->caseLevel = value; | |
212 | break; | |
213 | case UCOL_NORMALIZATION_MODE: | |
214 | opts->normalizationMode = value; | |
215 | break; | |
216 | case UCOL_STRENGTH: | |
217 | opts->strength = value; | |
218 | break; | |
219 | case UCOL_NUMERIC_COLLATION: | |
374ca955 A |
220 | opts->numericCollation = value; |
221 | break; | |
b75a7d8f A |
222 | case UCOL_ATTRIBUTE_COUNT: |
223 | default: | |
224 | break; | |
225 | } | |
226 | } | |
227 | ||
228 | #define UTOK_OPTION_COUNT 20 | |
229 | ||
230 | static UBool didInit = FALSE; | |
231 | /* we can be strict, or we can be lenient */ | |
232 | /* I'd surely be lenient with the option arguments */ | |
233 | /* maybe even with options */ | |
234 | U_STRING_DECL(suboption_00, "non-ignorable", 13); | |
235 | U_STRING_DECL(suboption_01, "shifted", 7); | |
236 | ||
237 | U_STRING_DECL(suboption_02, "lower", 5); | |
238 | U_STRING_DECL(suboption_03, "upper", 5); | |
239 | U_STRING_DECL(suboption_04, "off", 3); | |
240 | U_STRING_DECL(suboption_05, "on", 2); | |
241 | U_STRING_DECL(suboption_06, "1", 1); | |
242 | U_STRING_DECL(suboption_07, "2", 1); | |
243 | U_STRING_DECL(suboption_08, "3", 1); | |
244 | U_STRING_DECL(suboption_09, "4", 1); | |
245 | U_STRING_DECL(suboption_10, "I", 1); | |
246 | ||
247 | U_STRING_DECL(suboption_11, "primary", 7); | |
248 | U_STRING_DECL(suboption_12, "secondary", 9); | |
249 | U_STRING_DECL(suboption_13, "tertiary", 8); | |
250 | U_STRING_DECL(suboption_14, "variable", 8); | |
251 | U_STRING_DECL(suboption_15, "regular", 7); | |
252 | U_STRING_DECL(suboption_16, "implicit", 8); | |
253 | U_STRING_DECL(suboption_17, "trailing", 8); | |
254 | ||
255 | ||
256 | U_STRING_DECL(option_00, "undefined", 9); | |
374ca955 | 257 | U_STRING_DECL(option_01, "rearrange", 9); |
b75a7d8f | 258 | U_STRING_DECL(option_02, "alternate", 9); |
374ca955 A |
259 | U_STRING_DECL(option_03, "backwards", 9); |
260 | U_STRING_DECL(option_04, "variable top", 12); | |
261 | U_STRING_DECL(option_05, "top", 3); | |
262 | U_STRING_DECL(option_06, "normalization", 13); | |
263 | U_STRING_DECL(option_07, "caseLevel", 9); | |
264 | U_STRING_DECL(option_08, "caseFirst", 9); | |
265 | U_STRING_DECL(option_09, "scriptOrder", 11); | |
266 | U_STRING_DECL(option_10, "charsetname", 11); | |
267 | U_STRING_DECL(option_11, "charset", 7); | |
268 | U_STRING_DECL(option_12, "before", 6); | |
b75a7d8f A |
269 | U_STRING_DECL(option_13, "hiraganaQ", 9); |
270 | U_STRING_DECL(option_14, "strength", 8); | |
271 | U_STRING_DECL(option_15, "first", 5); | |
272 | U_STRING_DECL(option_16, "last", 4); | |
273 | U_STRING_DECL(option_17, "optimize", 8); | |
274 | U_STRING_DECL(option_18, "suppressContractions", 20); | |
374ca955 | 275 | U_STRING_DECL(option_19, "numericOrdering", 15); |
b75a7d8f A |
276 | |
277 | ||
278 | /* | |
374ca955 A |
279 | [last variable] last variable value |
280 | [last primary ignorable] largest CE for primary ignorable | |
281 | [last secondary ignorable] largest CE for secondary ignorable | |
282 | [last tertiary ignorable] largest CE for tertiary ignorable | |
283 | [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) | |
b75a7d8f A |
284 | */ |
285 | ||
286 | ||
287 | static const ucolTokSuboption alternateSub[2] = { | |
288 | {suboption_00, 13, UCOL_NON_IGNORABLE}, | |
289 | {suboption_01, 7, UCOL_SHIFTED} | |
290 | }; | |
291 | ||
292 | static const ucolTokSuboption caseFirstSub[3] = { | |
293 | {suboption_02, 5, UCOL_LOWER_FIRST}, | |
294 | {suboption_03, 5, UCOL_UPPER_FIRST}, | |
295 | {suboption_04, 3, UCOL_OFF}, | |
296 | }; | |
297 | ||
298 | static const ucolTokSuboption onOffSub[2] = { | |
299 | {suboption_04, 3, UCOL_OFF}, | |
300 | {suboption_05, 2, UCOL_ON} | |
301 | }; | |
302 | ||
303 | static const ucolTokSuboption frenchSub[1] = { | |
304 | {suboption_07, 1, UCOL_ON} | |
305 | }; | |
306 | ||
307 | static const ucolTokSuboption beforeSub[3] = { | |
308 | {suboption_06, 1, UCOL_PRIMARY}, | |
309 | {suboption_07, 1, UCOL_SECONDARY}, | |
310 | {suboption_08, 1, UCOL_TERTIARY} | |
311 | }; | |
312 | ||
313 | static const ucolTokSuboption strengthSub[5] = { | |
314 | {suboption_06, 1, UCOL_PRIMARY}, | |
315 | {suboption_07, 1, UCOL_SECONDARY}, | |
316 | {suboption_08, 1, UCOL_TERTIARY}, | |
317 | {suboption_09, 1, UCOL_QUATERNARY}, | |
318 | {suboption_10, 1, UCOL_IDENTICAL}, | |
319 | }; | |
320 | ||
321 | static const ucolTokSuboption firstLastSub[7] = { | |
322 | {suboption_11, 7, UCOL_PRIMARY}, | |
323 | {suboption_12, 9, UCOL_PRIMARY}, | |
324 | {suboption_13, 8, UCOL_PRIMARY}, | |
325 | {suboption_14, 8, UCOL_PRIMARY}, | |
326 | {suboption_15, 7, UCOL_PRIMARY}, | |
327 | {suboption_16, 8, UCOL_PRIMARY}, | |
328 | {suboption_17, 8, UCOL_PRIMARY}, | |
329 | }; | |
330 | ||
/*
 * Indexes into rulesOptions[]; the enumerator order must stay in step with
 * the order of that table.  Values up to OPTION_NORMAL_OPTIONS_LIMIT are the
 * options that map directly onto a collator attribute.
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,                          /*  0 */
    OPTION_FRENCH_COLLATION,                                /*  1 */
    OPTION_CASE_LEVEL,                                      /*  2 */
    OPTION_CASE_FIRST,                                      /*  3 */
    OPTION_NORMALIZATION_MODE,                              /*  4 */
    OPTION_HIRAGANA_QUATERNARY,                             /*  5 */
    OPTION_STRENGTH,                                        /*  6 */
    OPTION_NUMERIC_COLLATION,                               /*  7 */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, /*  7 */
    OPTION_VARIABLE_TOP,                                    /*  8 */
    OPTION_REARRANGE,                                       /*  9 */
    OPTION_BEFORE,                                          /* 10 */
    OPTION_TOP,                                             /* 11 */
    OPTION_FIRST,                                           /* 12 */
    OPTION_LAST,                                            /* 13 */
    OPTION_OPTIMIZE,                                        /* 14 */
    OPTION_SUPPRESS_CONTRACTIONS,                           /* 15 */
    OPTION_UNDEFINED,                                       /* 16 */
    OPTION_SCRIPT_ORDER,                                    /* 17 */
    OPTION_CHARSET_NAME,                                    /* 18 */
    OPTION_CHARSET                                          /* 19 */
};
354 | ||
355 | static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { | |
356 | /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ | |
357 | /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ | |
358 | /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ | |
359 | /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ | |
360 | /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ | |
361 | /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */ | |
362 | /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ | |
374ca955 | 363 | /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/ |
b75a7d8f A |
364 | /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ |
365 | /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ | |
366 | /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ | |
367 | /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ | |
368 | /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ | |
369 | /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ | |
370 | /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ | |
371 | /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */ | |
372 | /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ | |
373 | /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ | |
374 | /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ | |
375 | /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */ | |
376 | }; | |
377 | ||
378 | static | |
374ca955 A |
379 | int32_t u_strncmpNoCase(const UChar *s1, |
380 | const UChar *s2, | |
381 | int32_t n) | |
b75a7d8f A |
382 | { |
383 | if(n > 0) { | |
384 | int32_t rc; | |
385 | for(;;) { | |
386 | rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); | |
387 | if(rc != 0 || *s1 == 0 || --n == 0) { | |
388 | return rc; | |
389 | } | |
390 | ++s1; | |
391 | ++s2; | |
392 | } | |
393 | } | |
394 | return 0; | |
395 | } | |
396 | ||
397 | static | |
398 | void ucol_uprv_tok_initData() { | |
399 | if(!didInit) { | |
400 | U_STRING_INIT(suboption_00, "non-ignorable", 13); | |
401 | U_STRING_INIT(suboption_01, "shifted", 7); | |
402 | ||
403 | U_STRING_INIT(suboption_02, "lower", 5); | |
404 | U_STRING_INIT(suboption_03, "upper", 5); | |
405 | U_STRING_INIT(suboption_04, "off", 3); | |
406 | U_STRING_INIT(suboption_05, "on", 2); | |
407 | ||
408 | U_STRING_INIT(suboption_06, "1", 1); | |
409 | U_STRING_INIT(suboption_07, "2", 1); | |
410 | U_STRING_INIT(suboption_08, "3", 1); | |
411 | U_STRING_INIT(suboption_09, "4", 1); | |
412 | U_STRING_INIT(suboption_10, "I", 1); | |
413 | ||
414 | U_STRING_INIT(suboption_11, "primary", 7); | |
415 | U_STRING_INIT(suboption_12, "secondary", 9); | |
416 | U_STRING_INIT(suboption_13, "tertiary", 8); | |
417 | U_STRING_INIT(suboption_14, "variable", 8); | |
418 | U_STRING_INIT(suboption_15, "regular", 7); | |
419 | U_STRING_INIT(suboption_16, "implicit", 8); | |
420 | U_STRING_INIT(suboption_17, "trailing", 8); | |
421 | ||
422 | ||
423 | U_STRING_INIT(option_00, "undefined", 9); | |
374ca955 | 424 | U_STRING_INIT(option_01, "rearrange", 9); |
b75a7d8f | 425 | U_STRING_INIT(option_02, "alternate", 9); |
374ca955 A |
426 | U_STRING_INIT(option_03, "backwards", 9); |
427 | U_STRING_INIT(option_04, "variable top", 12); | |
428 | U_STRING_INIT(option_05, "top", 3); | |
429 | U_STRING_INIT(option_06, "normalization", 13); | |
430 | U_STRING_INIT(option_07, "caseLevel", 9); | |
431 | U_STRING_INIT(option_08, "caseFirst", 9); | |
432 | U_STRING_INIT(option_09, "scriptOrder", 11); | |
433 | U_STRING_INIT(option_10, "charsetname", 11); | |
434 | U_STRING_INIT(option_11, "charset", 7); | |
435 | U_STRING_INIT(option_12, "before", 6); | |
b75a7d8f A |
436 | U_STRING_INIT(option_13, "hiraganaQ", 9); |
437 | U_STRING_INIT(option_14, "strength", 8); | |
438 | U_STRING_INIT(option_15, "first", 5); | |
439 | U_STRING_INIT(option_16, "last", 4); | |
440 | U_STRING_INIT(option_17, "optimize", 8); | |
441 | U_STRING_INIT(option_18, "suppressContractions", 20); | |
374ca955 | 442 | U_STRING_INIT(option_19, "numericOrdering", 15); |
b75a7d8f A |
443 | didInit = TRUE; |
444 | } | |
445 | } | |
446 | ||
447 | ||
448 | // This function reads basic options to set in the runtime collator | |
449 | // used by data driven tests. Should not support build time options | |
450 | U_CAPI const UChar * U_EXPORT2 | |
374ca955 A |
451 | ucol_tok_getNextArgument(const UChar *start, const UChar *end, |
452 | UColAttribute *attrib, UColAttributeValue *value, | |
b75a7d8f A |
453 | UErrorCode *status) { |
454 | uint32_t i = 0; | |
455 | int32_t j=0; | |
456 | UBool foundOption = FALSE; | |
457 | const UChar *optionArg = NULL; | |
458 | ||
459 | ucol_uprv_tok_initData(); | |
460 | ||
461 | while(start < end && u_isWhitespace(*start)) { /* eat whitespace */ | |
462 | start++; | |
463 | } | |
464 | if(start >= end) { | |
465 | return NULL; | |
466 | } | |
467 | /* skip opening '[' */ | |
468 | if(*start == 0x005b) { | |
469 | start++; | |
470 | } else { | |
471 | *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' | |
472 | return NULL; | |
473 | } | |
474 | ||
475 | while(i < UTOK_OPTION_COUNT) { | |
476 | if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { | |
477 | foundOption = TRUE; | |
478 | if(end - start > rulesOptions[i].optionLen) { | |
479 | optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ | |
480 | while(u_isWhitespace(*optionArg)) { /* eat whitespace */ | |
481 | optionArg++; | |
482 | } | |
374ca955 | 483 | } |
b75a7d8f A |
484 | break; |
485 | } | |
486 | i++; | |
487 | } | |
488 | ||
489 | if(!foundOption) { | |
490 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
491 | return NULL; | |
492 | } | |
493 | ||
494 | if(optionArg) { | |
495 | for(j = 0; j<rulesOptions[i].subSize; j++) { | |
496 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { | |
497 | //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); | |
498 | *attrib = rulesOptions[i].attr; | |
499 | *value = rulesOptions[i].subopts[j].attrVal; | |
500 | optionArg += rulesOptions[i].subopts[j].subLen; | |
501 | while(u_isWhitespace(*optionArg)) { /* eat whitespace */ | |
502 | optionArg++; | |
503 | } | |
504 | if(*optionArg == 0x005d) { | |
505 | optionArg++; | |
506 | return optionArg; | |
507 | } else { | |
508 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
509 | return NULL; | |
510 | } | |
511 | } | |
512 | } | |
513 | } | |
514 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
515 | return NULL; | |
516 | } | |
517 | ||
374ca955 | 518 | static |
b75a7d8f A |
519 | USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { |
520 | while(*start != 0x005b) { /* advance while we find the first '[' */ | |
521 | start++; | |
522 | } | |
374ca955 | 523 | // now we need to get a balanced set of '[]'. The problem is that a set can have |
b75a7d8f A |
524 | // many, and *end point to the first closing '[' |
525 | int32_t noOpenBraces = 1; | |
526 | int32_t current = 1; // skip the opening brace | |
527 | while(start+current < end && noOpenBraces != 0) { | |
528 | if(start[current] == 0x005b) { | |
529 | noOpenBraces++; | |
530 | } else if(start[current] == 0x005D) { // closing brace | |
531 | noOpenBraces--; | |
532 | } | |
533 | current++; | |
534 | } | |
535 | UChar *nextBrace = NULL; | |
536 | ||
537 | if(noOpenBraces != 0 || (nextBrace = u_strchr(start+current, 0x005d /*']'*/)) == NULL) { | |
538 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
539 | return NULL; | |
540 | } | |
541 | return uset_openPattern(start, current, status); | |
542 | } | |
543 | ||
544 | static | |
545 | int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { | |
546 | int32_t i = 0; | |
374ca955 A |
547 | ucol_uprv_tok_initData(); |
548 | ||
b75a7d8f A |
549 | while(u_isWhitespace(*start)) { /* eat whitespace */ |
550 | start++; | |
551 | } | |
552 | while(i < UTOK_OPTION_COUNT) { | |
553 | if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { | |
554 | if(end - start > rulesOptions[i].optionLen) { | |
555 | *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/ | |
556 | while(u_isWhitespace(**optionArg)) { /* eat whitespace */ | |
557 | (*optionArg)++; | |
558 | } | |
374ca955 | 559 | } |
b75a7d8f A |
560 | break; |
561 | } | |
562 | i++; | |
563 | } | |
564 | if(i == UTOK_OPTION_COUNT) { | |
565 | i = -1; // didn't find an option | |
374ca955 | 566 | } |
b75a7d8f A |
567 | return i; |
568 | } | |
569 | ||
570 | ||
571 | // reads and conforms to various options in rules | |
572 | // end is the position of the first closing ']' | |
573 | // However, some of the options take an UnicodeSet definition | |
574 | // which needs to duplicate the closing ']' | |
575 | // for example: '[copy [\uAC00-\uD7FF]]' | |
374ca955 | 576 | // These options will move end to the second ']' and the |
b75a7d8f A |
577 | // caller will set the current to it. |
578 | static | |
579 | uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { | |
580 | const UChar* start = src->current; | |
581 | int32_t i = 0; | |
582 | int32_t j=0; | |
583 | const UChar *optionArg = NULL; | |
584 | ||
585 | uint8_t result = 0; | |
586 | ||
b75a7d8f A |
587 | start++; /*skip opening '['*/ |
588 | i = ucol_uprv_tok_readOption(start, src->end, &optionArg); | |
589 | if(optionArg) { | |
590 | src->current = optionArg; | |
591 | } | |
592 | ||
593 | if(i < 0) { | |
594 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
595 | } else { | |
596 | int32_t noOpenBraces = 1; | |
597 | switch(i) { | |
598 | case OPTION_ALTERNATE_HANDLING: | |
599 | case OPTION_FRENCH_COLLATION: | |
600 | case OPTION_CASE_LEVEL: | |
601 | case OPTION_CASE_FIRST: | |
602 | case OPTION_NORMALIZATION_MODE: | |
603 | case OPTION_HIRAGANA_QUATERNARY: | |
604 | case OPTION_STRENGTH: | |
605 | case OPTION_NUMERIC_COLLATION: | |
606 | if(optionArg) { | |
607 | for(j = 0; j<rulesOptions[i].subSize; j++) { | |
608 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { | |
609 | ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); | |
610 | result = UCOL_TOK_SUCCESS; | |
611 | } | |
612 | } | |
374ca955 | 613 | } |
b75a7d8f A |
614 | if(result == 0) { |
615 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
616 | } | |
617 | break; | |
618 | case OPTION_VARIABLE_TOP: | |
619 | result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; | |
620 | break; | |
621 | case OPTION_REARRANGE: | |
622 | result = UCOL_TOK_SUCCESS; | |
623 | break; | |
624 | case OPTION_BEFORE: | |
625 | if(optionArg) { | |
626 | for(j = 0; j<rulesOptions[i].subSize; j++) { | |
627 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { | |
628 | result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1; | |
629 | } | |
630 | } | |
631 | } | |
632 | if(result == 0) { | |
633 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
634 | } | |
635 | break; | |
636 | case OPTION_TOP: /* we are going to have an array with structures of limit CEs */ | |
637 | /* index to this array will be src->parsedToken.indirectIndex*/ | |
638 | src->parsedToken.indirectIndex = 0; | |
639 | result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; | |
640 | break; | |
641 | case OPTION_FIRST: | |
642 | case OPTION_LAST: /* first, last */ | |
643 | for(j = 0; j<rulesOptions[i].subSize; j++) { | |
644 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { | |
645 | // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first | |
646 | // element of indirect boundaries is reserved for top. | |
647 | src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); | |
648 | result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; | |
649 | } | |
650 | } | |
651 | if(result == 0) { | |
652 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
653 | } | |
654 | break; | |
655 | case OPTION_OPTIMIZE: | |
656 | case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization | |
657 | // we need to move end here | |
658 | src->current++; // skip opening brace | |
659 | while(src->current < src->end && noOpenBraces != 0) { | |
660 | if(*src->current == 0x005b) { | |
661 | noOpenBraces++; | |
662 | } else if(*src->current == 0x005D) { // closing brace | |
663 | noOpenBraces--; | |
664 | } | |
665 | src->current++; | |
666 | } | |
667 | result = UCOL_TOK_SUCCESS; | |
668 | break; | |
669 | default: | |
670 | *status = U_UNSUPPORTED_ERROR; | |
671 | break; | |
672 | } | |
673 | } | |
674 | src->current = u_memchr(src->current, 0x005d, src->end-src->current); | |
675 | return result; | |
676 | } | |
677 | ||
374ca955 A |
678 | |
679 | inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) { | |
680 | if(src->extraCurrent+len >= src->extraEnd) { | |
681 | /* reallocate */ | |
682 | UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar)); | |
683 | if(newSrc != NULL) { | |
684 | src->current = newSrc + (src->current - src->source); | |
685 | src->extraCurrent = newSrc + (src->extraCurrent - src->source); | |
686 | src->end = newSrc + (src->end - src->source); | |
687 | src->extraEnd = newSrc + (src->extraEnd-src->source)*2; | |
688 | src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); | |
689 | src->source = newSrc; | |
690 | } else { | |
691 | *status = U_MEMORY_ALLOCATION_ERROR; | |
692 | } | |
693 | } | |
694 | if(len == 1) { | |
695 | *src->extraCurrent++ = *stuff; | |
696 | } else { | |
697 | uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar)); | |
698 | src->extraCurrent += len; | |
699 | } | |
700 | ||
701 | ||
702 | } | |
703 | ||
704 | inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { | |
b75a7d8f A |
705 | /* |
706 | top = TRUE; | |
707 | */ | |
374ca955 | 708 | UChar buff[5]; |
b75a7d8f | 709 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
374ca955 A |
710 | buff[0] = 0xFFFE; |
711 | buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); | |
712 | buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); | |
b75a7d8f A |
713 | if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) { |
714 | src->parsedToken.charsLen = 3; | |
374ca955 | 715 | ucol_tok_addToExtraCurrent(src, buff, 3, status); |
b75a7d8f | 716 | } else { |
374ca955 A |
717 | buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); |
718 | buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); | |
b75a7d8f | 719 | src->parsedToken.charsLen = 5; |
374ca955 A |
720 | ucol_tok_addToExtraCurrent(src, buff, 5, status); |
721 | } | |
b75a7d8f A |
722 | return TRUE; |
723 | } | |
724 | ||
374ca955 A |
725 | static UBool isCharNewLine(UChar c){ |
726 | switch(c){ | |
727 | case 0x000A: /* LF */ | |
728 | case 0x000D: /* CR */ | |
729 | case 0x000C: /* FF */ | |
730 | case 0x0085: /* NEL */ | |
731 | case 0x2028: /* LS */ | |
732 | case 0x2029: /* PS */ | |
733 | return TRUE; | |
734 | default: | |
735 | return FALSE; | |
736 | } | |
737 | } | |
738 | ||
b75a7d8f | 739 | U_CAPI const UChar* U_EXPORT2 |
374ca955 | 740 | ucol_tok_parseNextToken(UColTokenParser *src, |
b75a7d8f A |
741 | UBool startOfRules, |
742 | UParseError *parseError, | |
374ca955 | 743 | UErrorCode *status) { |
b75a7d8f A |
744 | /* parsing part */ |
745 | UBool variableTop = FALSE; | |
746 | UBool top = FALSE; | |
747 | UBool inChars = TRUE; | |
748 | UBool inQuote = FALSE; | |
749 | UBool wasInQuote = FALSE; | |
750 | UChar *optionEnd = NULL; | |
751 | uint8_t before = 0; | |
752 | UBool isEscaped = FALSE; | |
753 | // TODO: replace these variables with src->parsedToken counterparts | |
754 | // no need to use them anymore since we have src->parsedToken. | |
755 | // Ideally, token parser would be a nice class... Once, when I have | |
756 | // more time (around 2020 probably). | |
757 | uint32_t newExtensionLen = 0; | |
758 | uint32_t extensionOffset = 0; | |
374ca955 A |
759 | uint32_t newStrength = UCOL_TOK_UNSET; |
760 | UChar buff[10]; | |
b75a7d8f A |
761 | |
762 | src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; | |
763 | src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; | |
764 | src->parsedToken.indirectIndex = 0; | |
765 | ||
766 | while (src->current < src->end) { | |
767 | UChar ch = *(src->current); | |
768 | ||
769 | if (inQuote) { | |
770 | if (ch == 0x0027/*'\''*/) { | |
771 | inQuote = FALSE; | |
772 | } else { | |
773 | if ((src->parsedToken.charsLen == 0) || inChars) { | |
774 | if(src->parsedToken.charsLen == 0) { | |
775 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
776 | } | |
777 | src->parsedToken.charsLen++; | |
778 | } else { | |
779 | if(newExtensionLen == 0) { | |
780 | extensionOffset = (uint32_t)(src->extraCurrent - src->source); | |
781 | } | |
782 | newExtensionLen++; | |
783 | } | |
784 | } | |
785 | }else if(isEscaped){ | |
786 | isEscaped =FALSE; | |
787 | if (newStrength == UCOL_TOK_UNSET) { | |
374ca955 | 788 | *status = U_INVALID_FORMAT_ERROR; |
b75a7d8f A |
789 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
790 | return NULL; | |
791 | // enabling rules to start with non-tokens a < b | |
374ca955 | 792 | // newStrength = UCOL_TOK_RESET; |
b75a7d8f A |
793 | } |
794 | if(ch != 0x0000 && src->current != src->end) { | |
795 | if (inChars) { | |
796 | if(src->parsedToken.charsLen == 0) { | |
797 | src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); | |
798 | } | |
799 | src->parsedToken.charsLen++; | |
800 | } else { | |
801 | if(newExtensionLen == 0) { | |
802 | extensionOffset = (uint32_t)(src->current - src->source); | |
803 | } | |
804 | newExtensionLen++; | |
805 | } | |
806 | } | |
807 | }else { | |
808 | if(!uprv_isRuleWhiteSpace(ch)) { | |
809 | /* Sets the strength for this entry */ | |
810 | switch (ch) { | |
374ca955 | 811 | case 0x003D/*'='*/ : |
b75a7d8f A |
812 | if (newStrength != UCOL_TOK_UNSET) { |
813 | goto EndOfLoop; | |
814 | } | |
815 | ||
816 | /* if we start with strength, we'll reset to top */ | |
817 | if(startOfRules == TRUE) { | |
818 | src->parsedToken.indirectIndex = 5; | |
374ca955 | 819 | top = ucol_tok_doSetTop(src, status); |
b75a7d8f A |
820 | newStrength = UCOL_TOK_RESET; |
821 | goto EndOfLoop; | |
822 | } | |
823 | newStrength = UCOL_IDENTICAL; | |
824 | break; | |
825 | ||
374ca955 | 826 | case 0x002C/*','*/: |
b75a7d8f A |
827 | if (newStrength != UCOL_TOK_UNSET) { |
828 | goto EndOfLoop; | |
829 | } | |
830 | ||
831 | /* if we start with strength, we'll reset to top */ | |
832 | if(startOfRules == TRUE) { | |
833 | src->parsedToken.indirectIndex = 5; | |
374ca955 | 834 | top = ucol_tok_doSetTop(src, status); |
b75a7d8f A |
835 | newStrength = UCOL_TOK_RESET; |
836 | goto EndOfLoop; | |
837 | } | |
838 | newStrength = UCOL_TERTIARY; | |
839 | break; | |
840 | ||
841 | case 0x003B/*';'*/: | |
842 | if (newStrength != UCOL_TOK_UNSET) { | |
843 | goto EndOfLoop; | |
844 | } | |
845 | ||
846 | /* if we start with strength, we'll reset to top */ | |
847 | if(startOfRules == TRUE) { | |
848 | src->parsedToken.indirectIndex = 5; | |
374ca955 | 849 | top = ucol_tok_doSetTop(src, status); |
b75a7d8f A |
850 | newStrength = UCOL_TOK_RESET; |
851 | goto EndOfLoop; | |
852 | } | |
853 | newStrength = UCOL_SECONDARY; | |
854 | break; | |
855 | ||
374ca955 | 856 | case 0x003C/*'<'*/: |
b75a7d8f A |
857 | if (newStrength != UCOL_TOK_UNSET) { |
858 | goto EndOfLoop; | |
859 | } | |
860 | ||
861 | /* if we start with strength, we'll reset to top */ | |
862 | if(startOfRules == TRUE) { | |
863 | src->parsedToken.indirectIndex = 5; | |
374ca955 | 864 | top = ucol_tok_doSetTop(src, status); |
b75a7d8f A |
865 | newStrength = UCOL_TOK_RESET; |
866 | goto EndOfLoop; | |
867 | } | |
868 | /* before this, do a scan to verify whether this is */ | |
869 | /* another strength */ | |
870 | if(*(src->current+1) == 0x003C) { | |
871 | src->current++; | |
872 | if(*(src->current+1) == 0x003C) { | |
873 | src->current++; /* three in a row! */ | |
874 | newStrength = UCOL_TERTIARY; | |
875 | } else { /* two in a row */ | |
876 | newStrength = UCOL_SECONDARY; | |
877 | } | |
878 | } else { /* just one */ | |
879 | newStrength = UCOL_PRIMARY; | |
880 | } | |
881 | break; | |
882 | ||
374ca955 | 883 | case 0x0026/*'&'*/: |
b75a7d8f A |
884 | if (newStrength != UCOL_TOK_UNSET) { |
885 | /**/ | |
886 | goto EndOfLoop; | |
887 | } | |
888 | ||
889 | newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ | |
890 | break; | |
891 | ||
892 | case 0x005b/*'['*/: | |
893 | /* options - read an option, analyze it */ | |
894 | if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) { | |
895 | uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); | |
896 | //src->current = optionEnd; | |
897 | if(U_SUCCESS(*status)) { | |
898 | if(result & UCOL_TOK_TOP) { | |
374ca955 A |
899 | if(newStrength == UCOL_TOK_RESET) { |
900 | top = ucol_tok_doSetTop(src, status); | |
b75a7d8f | 901 | if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b' |
b75a7d8f | 902 | src->parsedToken.charsLen+=2; |
374ca955 A |
903 | buff[0] = 0x002d; |
904 | buff[1] = before; | |
905 | ucol_tok_addToExtraCurrent(src, buff, 2, status); | |
b75a7d8f A |
906 | } |
907 | ||
908 | src->current++; | |
909 | goto EndOfLoop; | |
910 | } else { | |
911 | *status = U_INVALID_FORMAT_ERROR; | |
912 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
913 | } | |
914 | } else if(result & UCOL_TOK_VARIABLE_TOP) { | |
915 | if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { | |
916 | variableTop = TRUE; | |
917 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
918 | src->parsedToken.charsLen = 1; | |
374ca955 A |
919 | buff[0] = 0xFFFF; |
920 | ucol_tok_addToExtraCurrent(src, buff, 1, status); | |
b75a7d8f A |
921 | src->current++; |
922 | goto EndOfLoop; | |
923 | } else { | |
924 | *status = U_INVALID_FORMAT_ERROR; | |
925 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
926 | } | |
927 | } else if (result & UCOL_TOK_BEFORE){ | |
928 | if(newStrength == UCOL_TOK_RESET) { | |
929 | before = result & UCOL_TOK_BEFORE; | |
930 | } else { | |
931 | *status = U_INVALID_FORMAT_ERROR; | |
932 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
933 | ||
934 | } | |
374ca955 | 935 | } |
b75a7d8f A |
936 | } else { |
937 | *status = U_INVALID_FORMAT_ERROR; | |
938 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
939 | return NULL; | |
940 | } | |
941 | } | |
942 | break; | |
374ca955 A |
943 | case 0x0021/*! skip java thai modifier reordering*/: |
944 | break; | |
b75a7d8f A |
945 | case 0x002F/*'/'*/: |
946 | wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ | |
947 | inChars = FALSE; /* we're now processing expansion */ | |
948 | break; | |
949 | case 0x005C /* back slash for escaped chars */: | |
950 | isEscaped = TRUE; | |
951 | break; | |
952 | /* found a quote, we're gonna start copying */ | |
953 | case 0x0027/*'\''*/: | |
954 | if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ | |
374ca955 | 955 | *status = U_INVALID_FORMAT_ERROR; |
b75a7d8f A |
956 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
957 | return NULL; | |
374ca955 | 958 | // enabling rules to start with a non-token character a < b |
b75a7d8f A |
959 | // newStrength = UCOL_TOK_RESET; |
960 | } | |
961 | ||
962 | inQuote = TRUE; | |
963 | ||
964 | if(inChars) { /* we're doing characters */ | |
965 | if(wasInQuote == FALSE) { | |
966 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
967 | } | |
968 | if (src->parsedToken.charsLen != 0) { | |
374ca955 | 969 | ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); |
b75a7d8f A |
970 | } |
971 | src->parsedToken.charsLen++; | |
972 | } else { /* we're doing an expansion */ | |
973 | if(wasInQuote == FALSE) { | |
974 | extensionOffset = (uint32_t)(src->extraCurrent - src->source); | |
975 | } | |
976 | if (newExtensionLen != 0) { | |
374ca955 | 977 | ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); |
b75a7d8f A |
978 | } |
979 | newExtensionLen++; | |
980 | } | |
981 | ||
982 | wasInQuote = TRUE; | |
983 | ||
374ca955 | 984 | ch = *(++(src->current)); |
b75a7d8f | 985 | if(ch == 0x0027) { /* copy the double quote */ |
374ca955 | 986 | ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
b75a7d8f A |
987 | inQuote = FALSE; |
988 | } | |
989 | break; | |
990 | ||
991 | /* '@' is french only if the strength is not currently set */ | |
992 | /* if it is, it's just a regular character in collation rules */ | |
993 | case 0x0040/*'@'*/: | |
994 | if (newStrength == UCOL_TOK_UNSET) { | |
995 | src->opts->frenchCollation = UCOL_ON; | |
996 | break; | |
997 | } | |
998 | ||
999 | case 0x007C /*|*/: /* this means we have actually been reading prefix part */ | |
1000 | // we want to store read characters to the prefix part and continue reading | |
1001 | // the characters (proper way would be to restart reading the chars, but in | |
374ca955 | 1002 | // that case we would have to complicate the token hasher, which I do not |
b75a7d8f A |
1003 | // intend to play with. Instead, we will do prefixes when prefixes are due |
1004 | // (before adding the elements). | |
1005 | src->parsedToken.prefixOffset = src->parsedToken.charsOffset; | |
1006 | src->parsedToken.prefixLen = src->parsedToken.charsLen; | |
1007 | ||
1008 | if(inChars) { /* we're doing characters */ | |
1009 | if(wasInQuote == FALSE) { | |
1010 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
1011 | } | |
1012 | if (src->parsedToken.charsLen != 0) { | |
374ca955 | 1013 | ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); |
b75a7d8f A |
1014 | } |
1015 | src->parsedToken.charsLen++; | |
1016 | } | |
1017 | ||
1018 | wasInQuote = TRUE; | |
1019 | ||
1020 | do { | |
374ca955 | 1021 | ch = *(++(src->current)); |
b75a7d8f A |
1022 | // skip whitespace between '|' and the character |
1023 | } while (uprv_isRuleWhiteSpace(ch)); | |
1024 | break; | |
374ca955 | 1025 | |
b75a7d8f A |
1026 | //charsOffset = 0; |
1027 | //newCharsLen = 0; | |
1028 | //break; // We want to store the whole prefix/character sequence. If we break | |
1029 | // the '|' is going to get lost. | |
374ca955 A |
1030 | case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ |
1031 | do { | |
1032 | ch = *(++(src->current)); | |
1033 | } while (!isCharNewLine(ch)); | |
1034 | ||
1035 | break; | |
b75a7d8f A |
1036 | default: |
1037 | if (newStrength == UCOL_TOK_UNSET) { | |
374ca955 | 1038 | *status = U_INVALID_FORMAT_ERROR; |
b75a7d8f A |
1039 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
1040 | return NULL; | |
1041 | } | |
1042 | ||
1043 | if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { | |
1044 | *status = U_INVALID_FORMAT_ERROR; | |
1045 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); | |
1046 | return NULL; | |
1047 | } | |
1048 | ||
1049 | if(ch == 0x0000 && src->current+1 == src->end) { | |
1050 | break; | |
1051 | } | |
1052 | ||
1053 | if (inChars) { | |
1054 | if(src->parsedToken.charsLen == 0) { | |
1055 | src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); | |
1056 | } | |
1057 | src->parsedToken.charsLen++; | |
1058 | } else { | |
1059 | if(newExtensionLen == 0) { | |
1060 | extensionOffset = (uint32_t)(src->current - src->source); | |
1061 | } | |
1062 | newExtensionLen++; | |
1063 | } | |
1064 | ||
1065 | break; | |
374ca955 | 1066 | } |
b75a7d8f A |
1067 | } |
1068 | } | |
1069 | ||
1070 | if(wasInQuote) { | |
1071 | if(ch != 0x27) { | |
374ca955 A |
1072 | if(inQuote || !uprv_isRuleWhiteSpace(ch)) { |
1073 | ucol_tok_addToExtraCurrent(src, &ch, 1, status); | |
1074 | } | |
b75a7d8f A |
1075 | } |
1076 | } | |
1077 | ||
1078 | src->current++; | |
1079 | } | |
1080 | ||
1081 | EndOfLoop: | |
1082 | wasInQuote = FALSE; | |
1083 | if (newStrength == UCOL_TOK_UNSET) { | |
1084 | return NULL; | |
1085 | } | |
1086 | ||
1087 | if (src->parsedToken.charsLen == 0 && top == FALSE) { | |
374ca955 | 1088 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
b75a7d8f A |
1089 | *status = U_INVALID_FORMAT_ERROR; |
1090 | return NULL; | |
1091 | } | |
1092 | ||
374ca955 | 1093 | src->parsedToken.strength = newStrength; |
b75a7d8f A |
1094 | src->parsedToken.extensionOffset = extensionOffset; |
1095 | src->parsedToken.extensionLen = newExtensionLen; | |
1096 | src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; | |
1097 | ||
1098 | return src->current; | |
1099 | } | |
1100 | ||
1101 | /* | |
1102 | Processing Description | |
374ca955 A |
1103 | 1 Build a ListList. Each list has a header, which contains two lists (positive |
1104 | and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and | |
1105 | reset may be null. | |
1106 | 2 As you process, you keep a LAST pointer that points to the last token you | |
1107 | handled. | |
b75a7d8f A |
1108 | */ |
1109 | ||
1110 | static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext, | |
1111 | UParseError *parseError, UErrorCode *status) { | |
1112 | if(src->resultLen == src->listCapacity) { | |
1113 | // Unfortunately, this won't work, as we store addresses of lhs in token | |
1114 | src->listCapacity *= 2; | |
1115 | src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader)); | |
1116 | if(src->lh == NULL) { | |
1117 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1118 | return NULL; | |
1119 | } | |
1120 | } | |
1121 | /* do the reset thing */ | |
1122 | UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); | |
1123 | /* test for NULL */ | |
1124 | if (sourceToken == NULL) { | |
1125 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1126 | return NULL; | |
1127 | } | |
1128 | sourceToken->rulesToParse = src->source; | |
1129 | sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; | |
1130 | sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; | |
1131 | ||
1132 | sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); | |
1133 | sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); | |
1134 | ||
374ca955 A |
1135 | // keep the flags around so that we know about before |
1136 | sourceToken->flags = src->parsedToken.flags; | |
1137 | ||
b75a7d8f | 1138 | if(src->parsedToken.prefixOffset != 0) { |
374ca955 | 1139 | // this is a syntax error |
b75a7d8f A |
1140 | *status = U_INVALID_FORMAT_ERROR; |
1141 | syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); | |
1142 | return 0; | |
1143 | } else { | |
1144 | sourceToken->prefix = 0; | |
1145 | } | |
1146 | ||
1147 | sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ | |
1148 | sourceToken->strength = UCOL_TOK_RESET; | |
1149 | sourceToken->next = NULL; | |
1150 | sourceToken->previous = NULL; | |
1151 | sourceToken->noOfCEs = 0; | |
1152 | sourceToken->noOfExpCEs = 0; | |
1153 | sourceToken->listHeader = &src->lh[src->resultLen]; | |
1154 | ||
1155 | src->lh[src->resultLen].first = NULL; | |
1156 | src->lh[src->resultLen].last = NULL; | |
1157 | src->lh[src->resultLen].first = NULL; | |
1158 | src->lh[src->resultLen].last = NULL; | |
1159 | ||
1160 | src->lh[src->resultLen].reset = sourceToken; | |
1161 | ||
1162 | /* | |
374ca955 A |
1163 | 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
1164 | First convert all expansions into normal form. Examples: | |
1165 | If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * | |
1166 | d * ... into &x * c/y * d * ... | |
1167 | Note: reset values can never have expansions, although they can cause the | |
1168 | very next item to have one. They may be contractions, if they are found | |
1169 | earlier in the list. | |
b75a7d8f A |
1170 | */ |
1171 | if(expand != NULL) { | |
1172 | /* check to see if there is an expansion */ | |
1173 | if(src->parsedToken.charsLen > 1) { | |
1174 | uint32_t resetCharsOffset; | |
1175 | resetCharsOffset = (uint32_t)(expand - src->source); | |
1176 | sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset; | |
1177 | *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); | |
1178 | } else { | |
1179 | *expandNext = 0; | |
1180 | } | |
1181 | } | |
1182 | ||
1183 | src->resultLen++; | |
1184 | ||
1185 | uhash_put(src->tailored, sourceToken, sourceToken, status); | |
1186 | ||
1187 | return sourceToken; | |
1188 | } | |
1189 | ||
1190 | static | |
1191 | inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) { | |
1192 | if(U_FAILURE(*status)) { | |
1193 | return NULL; | |
1194 | } | |
1195 | /* this is a virgin before - we need to fish the anchor from the UCA */ | |
1196 | collIterate s; | |
1197 | uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; | |
1198 | uint32_t CE, SecondCE; | |
1199 | uint32_t invPos; | |
1200 | if(sourceToken != NULL) { | |
374ca955 | 1201 | uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s); |
b75a7d8f | 1202 | } else { |
374ca955 | 1203 | uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s); |
b75a7d8f A |
1204 | } |
1205 | ||
1206 | baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; | |
1207 | baseContCE = ucol_getNextCE(src->UCA, &s, status); | |
1208 | if(baseContCE == UCOL_NO_MORE_CES) { | |
1209 | baseContCE = 0; | |
1210 | } | |
1211 | ||
374ca955 A |
1212 | |
1213 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); | |
1214 | uint32_t ch = 0; | |
b75a7d8f | 1215 | uint32_t expandNext = 0; |
374ca955 A |
1216 | UColToken key; |
1217 | ||
1218 | if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ | |
1219 | uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16; | |
1220 | uint32_t raw = uprv_uca_getRawFromImplicit(primary); | |
1221 | ch = uprv_uca_getCodePointFromRaw(raw-1); | |
1222 | uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); | |
1223 | CE = primaryCE & UCOL_PRIMARYMASK | 0x0505; | |
1224 | SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER; | |
1225 | ||
1226 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
1227 | *src->extraCurrent++ = 0xFFFE; | |
1228 | *src->extraCurrent++ = (UChar)ch; | |
1229 | src->parsedToken.charsLen++; | |
1230 | ||
1231 | key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; | |
1232 | key.rulesToParse = src->source; | |
1233 | ||
1234 | //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); | |
1235 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
1236 | ||
1237 | if(sourceToken == NULL) { | |
1238 | src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
1239 | if(isContinuation(SecondCE)) { | |
1240 | src->lh[src->resultLen].baseContCE = SecondCE; | |
1241 | } else { | |
1242 | src->lh[src->resultLen].baseContCE = 0; | |
1243 | } | |
1244 | src->lh[src->resultLen].nextCE = 0; | |
1245 | src->lh[src->resultLen].nextContCE = 0; | |
1246 | src->lh[src->resultLen].previousCE = 0; | |
1247 | src->lh[src->resultLen].previousContCE = 0; | |
1248 | ||
1249 | src->lh[src->resultLen].indirect = FALSE; | |
1250 | ||
1251 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); | |
1252 | } | |
1253 | ||
1254 | } else { | |
1255 | invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); | |
1256 | ||
1257 | // we got the previous CE. Now we need to see if the difference between | |
1258 | // the two CEs is really of the requested strength. | |
1259 | // if it's a bigger difference (we asked for secondary and got primary), we | |
1260 | // need to modify the CE. | |
1261 | if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) { | |
1262 | // adjust the strength | |
1263 | // now we are in the situation where our baseCE should actually be modified in | |
1264 | // order to get the CE in the right position. | |
1265 | if(strength == UCOL_SECONDARY) { | |
1266 | CE = baseCE - 0x0200; | |
1267 | } else { // strength == UCOL_TERTIARY | |
1268 | CE = baseCE - 0x02; | |
1269 | } | |
1270 | if(baseContCE) { | |
1271 | if(strength == UCOL_SECONDARY) { | |
1272 | SecondCE = baseContCE - 0x0200; | |
1273 | } else { // strength == UCOL_TERTIARY | |
1274 | SecondCE = baseContCE - 0x02; | |
1275 | } | |
1276 | } | |
1277 | } | |
1278 | ||
1279 | #if 0 | |
1280 | // the code below relies on getting a code point from the inverse table, in order to be | |
1281 | // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: | |
1282 | // 1. There are many code points that have the same CE | |
1283 | // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. | |
1284 | // Also, in case when there is no equivalent strength before an element, we have to actually | |
1285 | // construct one. For example, &[before 2]a << x won't result in x << a, because the element | |
1286 | // before a is a primary difference. | |
1287 | ||
1288 | //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
1289 | ||
1290 | ||
1291 | ch = CETable[3*invPos+2]; | |
1292 | ||
1293 | if((ch & UCOL_INV_SIZEMASK) != 0) { | |
1294 | uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts); | |
1295 | uint32_t offset = (ch & UCOL_INV_OFFSETMASK); | |
1296 | ch = conts[offset]; | |
1297 | } | |
1298 | ||
1299 | *src->extraCurrent++ = (UChar)ch; | |
1300 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1); | |
1301 | src->parsedToken.charsLen = 1; | |
1302 | ||
1303 | // We got an UCA before. However, this might have been tailored. | |
1304 | // example: | |
1305 | // &\u30ca = \u306a | |
1306 | // &[before 3]\u306a<<<\u306a|\u309d | |
1307 | ||
1308 | ||
1309 | // uint32_t key = (*newCharsLen << 24) | *charsOffset; | |
1310 | key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; | |
1311 | key.rulesToParse = src->source; | |
1312 | ||
1313 | //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); | |
1314 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
1315 | #endif | |
1316 | ||
1317 | // here is how it should be. The situation such as &[before 1]a < x, should be | |
1318 | // resolved exactly as if we wrote &a > x. | |
1319 | // therefore, I don't really care if the UCA value before a has been changed. | |
1320 | // However, I do care if the strength between my element and the previous element | |
1321 | // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll | |
1322 | // have to construct the base CE. | |
1323 | ||
1324 | ||
1325 | ||
1326 | // if we found a tailored thing, we have to use the UCA value and construct | |
1327 | // a new reset token with constructed name | |
1328 | //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { | |
1329 | // character to which we want to anchor is already tailored. | |
1330 | // We need to construct a new token which will be the anchor | |
1331 | // point | |
1332 | //*(src->extraCurrent-1) = 0xFFFE; | |
1333 | //*src->extraCurrent++ = (UChar)ch; | |
1334 | // grab before | |
1335 | src->parsedToken.charsOffset -= 10; | |
1336 | src->parsedToken.charsLen += 10; | |
1337 | src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
1338 | if(isContinuation(SecondCE)) { | |
1339 | src->lh[src->resultLen].baseContCE = SecondCE; | |
1340 | } else { | |
1341 | src->lh[src->resultLen].baseContCE = 0; | |
1342 | } | |
1343 | src->lh[src->resultLen].nextCE = 0; | |
1344 | src->lh[src->resultLen].nextContCE = 0; | |
1345 | src->lh[src->resultLen].previousCE = 0; | |
1346 | src->lh[src->resultLen].previousContCE = 0; | |
b75a7d8f | 1347 | |
374ca955 | 1348 | src->lh[src->resultLen].indirect = FALSE; |
b75a7d8f | 1349 | |
374ca955 A |
1350 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
1351 | //} | |
b75a7d8f A |
1352 | } |
1353 | ||
1354 | return sourceToken; | |
1355 | ||
1356 | } | |
1357 | ||
1358 | uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { | |
1359 | UColToken *lastToken = NULL; | |
1360 | const UChar *parseEnd = NULL; | |
1361 | uint32_t expandNext = 0; | |
1362 | UBool variableTop = FALSE; | |
1363 | UBool top = FALSE; | |
1364 | uint16_t specs = 0; | |
1365 | UColTokListHeader *ListList = NULL; | |
1366 | ||
374ca955 | 1367 | src->parsedToken.strength = UCOL_TOK_UNSET; |
b75a7d8f A |
1368 | |
1369 | ListList = src->lh; | |
1370 | ||
1371 | if(U_FAILURE(*status)) { | |
1372 | return 0; | |
1373 | } | |
1374 | ||
1375 | while(src->current < src->end) { | |
1376 | src->parsedToken.prefixOffset = 0; | |
374ca955 A |
1377 | |
1378 | parseEnd = ucol_tok_parseNextToken(src, | |
b75a7d8f A |
1379 | (UBool)(lastToken == NULL), |
1380 | parseError, | |
1381 | status); | |
1382 | ||
1383 | specs = src->parsedToken.flags; | |
1384 | ||
1385 | ||
1386 | variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); | |
1387 | top = ((specs & UCOL_TOK_TOP) != 0); | |
1388 | ||
1389 | if(U_SUCCESS(*status) && parseEnd != NULL) { | |
1390 | UColToken *sourceToken = NULL; | |
1391 | //uint32_t key = 0; | |
1392 | uint32_t lastStrength = UCOL_TOK_UNSET; | |
374ca955 | 1393 | |
b75a7d8f A |
1394 | if(lastToken != NULL ) { |
1395 | lastStrength = lastToken->strength; | |
1396 | } | |
1397 | ||
1398 | //key = newCharsLen << 24 | charsOffset; | |
1399 | UColToken key; | |
1400 | key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; | |
1401 | key.rulesToParse = src->source; | |
1402 | ||
1403 | /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ | |
1404 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
1405 | ||
1406 | if(src->parsedToken.strength != UCOL_TOK_RESET) { | |
1407 | if(lastToken == NULL) { /* this means that rules haven't started properly */ | |
1408 | *status = U_INVALID_FORMAT_ERROR; | |
1409 | syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); | |
1410 | return 0; | |
1411 | } | |
1412 | /* 6 Otherwise (when relation != reset) */ | |
1413 | if(sourceToken == NULL) { | |
1414 | /* If sourceToken is null, create new one, */ | |
1415 | sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); | |
1416 | /* test for NULL */ | |
1417 | if (sourceToken == NULL) { | |
1418 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1419 | return 0; | |
1420 | } | |
1421 | sourceToken->rulesToParse = src->source; | |
1422 | sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; | |
1423 | ||
1424 | sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); | |
1425 | ||
1426 | sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; | |
1427 | sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); | |
1428 | ||
1429 | sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ | |
1430 | sourceToken->next = NULL; | |
1431 | sourceToken->previous = NULL; | |
1432 | sourceToken->noOfCEs = 0; | |
1433 | sourceToken->noOfExpCEs = 0; | |
374ca955 A |
1434 | // keep the flags around so that we know about before |
1435 | sourceToken->flags = src->parsedToken.flags; | |
b75a7d8f A |
1436 | uhash_put(src->tailored, sourceToken, sourceToken, status); |
1437 | } else { | |
1438 | /* we could have fished out a reset here */ | |
1439 | if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { | |
1440 | /* otherwise remove sourceToken from where it was. */ | |
1441 | if(sourceToken->next != NULL) { | |
1442 | if(sourceToken->next->strength > sourceToken->strength) { | |
1443 | sourceToken->next->strength = sourceToken->strength; | |
1444 | } | |
1445 | sourceToken->next->previous = sourceToken->previous; | |
1446 | } else { | |
1447 | sourceToken->listHeader->last = sourceToken->previous; | |
1448 | } | |
1449 | ||
1450 | if(sourceToken->previous != NULL) { | |
1451 | sourceToken->previous->next = sourceToken->next; | |
1452 | } else { | |
1453 | sourceToken->listHeader->first = sourceToken->next; | |
1454 | } | |
1455 | sourceToken->next = NULL; | |
1456 | sourceToken->previous = NULL; | |
1457 | } | |
1458 | } | |
1459 | ||
1460 | sourceToken->strength = src->parsedToken.strength; | |
1461 | sourceToken->listHeader = lastToken->listHeader; | |
1462 | ||
1463 | /* | |
374ca955 A |
1464 | 1. Find the strongest strength in each list, and set strongestP and strongestN |
1465 | accordingly in the headers. | |
b75a7d8f | 1466 | */ |
374ca955 | 1467 | if(lastStrength == UCOL_TOK_RESET |
b75a7d8f | 1468 | || sourceToken->listHeader->first == 0) { |
374ca955 | 1469 | /* If LAST is a reset |
b75a7d8f A |
1470 | insert sourceToken in the list. */ |
1471 | if(sourceToken->listHeader->first == 0) { | |
1472 | sourceToken->listHeader->first = sourceToken; | |
1473 | sourceToken->listHeader->last = sourceToken; | |
1474 | } else { /* we need to find a place for us */ | |
1475 | /* and we'll get in front of the same strength */ | |
1476 | if(sourceToken->listHeader->first->strength <= sourceToken->strength) { | |
1477 | sourceToken->next = sourceToken->listHeader->first; | |
1478 | sourceToken->next->previous = sourceToken; | |
1479 | sourceToken->listHeader->first = sourceToken; | |
1480 | sourceToken->previous = NULL; | |
1481 | } else { | |
1482 | lastToken = sourceToken->listHeader->first; | |
1483 | while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { | |
1484 | lastToken = lastToken->next; | |
1485 | } | |
1486 | if(lastToken->next != NULL) { | |
1487 | lastToken->next->previous = sourceToken; | |
1488 | } else { | |
1489 | sourceToken->listHeader->last = sourceToken; | |
1490 | } | |
1491 | sourceToken->previous = lastToken; | |
1492 | sourceToken->next = lastToken->next; | |
1493 | lastToken->next = sourceToken; | |
1494 | } | |
1495 | } | |
1496 | } else { | |
374ca955 A |
1497 | /* Otherwise (when LAST is not a reset) |
1498 | if polarity (LAST) == polarity(relation), insert sourceToken after LAST, | |
1499 | otherwise insert before. | |
1500 | when inserting after or before, search to the next position with the same | |
b75a7d8f | 1501 | strength in that direction. (This is called postpone insertion). */ |
374ca955 | 1502 | if(sourceToken != lastToken) { |
b75a7d8f A |
1503 | if(lastToken->polarity == sourceToken->polarity) { |
1504 | while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { | |
1505 | lastToken = lastToken->next; | |
1506 | } | |
1507 | sourceToken->previous = lastToken; | |
1508 | if(lastToken->next != NULL) { | |
1509 | lastToken->next->previous = sourceToken; | |
1510 | } else { | |
1511 | sourceToken->listHeader->last = sourceToken; | |
1512 | } | |
1513 | ||
1514 | sourceToken->next = lastToken->next; | |
1515 | lastToken->next = sourceToken; | |
1516 | } else { | |
1517 | while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { | |
1518 | lastToken = lastToken->previous; | |
1519 | } | |
1520 | sourceToken->next = lastToken; | |
1521 | if(lastToken->previous != NULL) { | |
1522 | lastToken->previous->next = sourceToken; | |
1523 | } else { | |
1524 | sourceToken->listHeader->first = sourceToken; | |
1525 | } | |
1526 | sourceToken->previous = lastToken->previous; | |
1527 | lastToken->previous = sourceToken; | |
1528 | } | |
1529 | } else { /* repeated one thing twice in rules, stay with the stronger strength */ | |
1530 | if(lastStrength < sourceToken->strength) { | |
1531 | sourceToken->strength = lastStrength; | |
1532 | } | |
1533 | } | |
1534 | } | |
1535 | ||
1536 | /* if the token was a variable top, we're gonna put it in */ | |
1537 | if(variableTop == TRUE && src->varTop == NULL) { | |
1538 | variableTop = FALSE; | |
1539 | src->varTop = sourceToken; | |
1540 | } | |
1541 | ||
1542 | // Treat the expansions. | |
374ca955 A |
1543 | // There are two types of expansions: explicit (x / y) and reset based propagating expansions |
1544 | // (&abc * d * e <=> &ab * d / c * e / c) | |
b75a7d8f A |
1545 | // if both of them are in effect for a token, they are combined. |
1546 | ||
1547 | sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; | |
1548 | ||
1549 | if(expandNext != 0) { | |
1550 | if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ | |
1551 | expandNext = 0; | |
1552 | } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ | |
1553 | sourceToken->expansion = expandNext; | |
1554 | } else { /* there is both explicit and implicit expansion. We need to make a combination */ | |
1555 | uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); | |
1556 | uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); | |
1557 | sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (src->extraCurrent - src->source)); | |
1558 | src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; | |
1559 | } | |
1560 | } | |
1561 | ||
1562 | // This is just for debugging purposes | |
1563 | if(sourceToken->expansion != 0) { | |
1564 | sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); | |
1565 | } else { | |
1566 | sourceToken->debugExpansion = 0; | |
1567 | } | |
374ca955 A |
1568 | // if the previous token was a reset before, the strength of this |
1569 | // token must match the strength of before. Otherwise we have an | |
1570 | // undefined situation. | |
1571 | // In other words, we currently have a kludge which we use to | |
1572 | // represent &a >> x. This is written as &[before 2]a << x. | |
1573 | if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { | |
1574 | uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; | |
1575 | if(beforeStrength != sourceToken->strength) { | |
1576 | *status = U_INVALID_FORMAT_ERROR; | |
1577 | syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); | |
1578 | return 0; | |
1579 | } | |
1580 | } | |
b75a7d8f A |
1581 | } else { |
1582 | if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { | |
1583 | /* if the previous token was also a reset, */ | |
1584 | /*this means that we have two consecutive resets */ | |
1585 | /* and we want to remove the previous one if empty*/ | |
1586 | if(ListList[src->resultLen-1].first == NULL) { | |
1587 | src->resultLen--; | |
1588 | } | |
1589 | } | |
1590 | ||
1591 | if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ | |
1592 | uint32_t searchCharsLen = src->parsedToken.charsLen; | |
1593 | while(searchCharsLen > 1 && sourceToken == NULL) { | |
1594 | searchCharsLen--; | |
1595 | //key = searchCharsLen << 24 | charsOffset; | |
1596 | UColToken key; | |
1597 | key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; | |
1598 | key.rulesToParse = src->source; | |
1599 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
1600 | } | |
1601 | if(sourceToken != NULL) { | |
1602 | expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); | |
1603 | } | |
1604 | } | |
1605 | ||
1606 | if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ | |
1607 | if(top == FALSE) { /* there is no indirection */ | |
1608 | uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; | |
374ca955 | 1609 | if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |
b75a7d8f A |
1610 | /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ |
1611 | while(sourceToken->strength > strength && sourceToken->previous != NULL) { | |
1612 | sourceToken = sourceToken->previous; | |
1613 | } | |
1614 | /* here, either we hit the strength or NULL */ | |
1615 | if(sourceToken->strength == strength) { | |
1616 | if(sourceToken->previous != NULL) { | |
1617 | sourceToken = sourceToken->previous; | |
1618 | } else { /* start of list */ | |
1619 | sourceToken = sourceToken->listHeader->reset; | |
374ca955 | 1620 | } |
b75a7d8f A |
1621 | } else { /* we hit NULL */ |
1622 | /* we should be doing the else part */ | |
1623 | sourceToken = sourceToken->listHeader->reset; | |
1624 | sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); | |
1625 | } | |
1626 | } else { | |
1627 | sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); | |
1628 | } | |
1629 | } else { /* this is both before and indirection */ | |
1630 | top = FALSE; | |
1631 | ListList[src->resultLen].previousCE = 0; | |
1632 | ListList[src->resultLen].previousContCE = 0; | |
1633 | ListList[src->resultLen].indirect = TRUE; | |
1634 | /* we need to do slightly more work. we need to get the baseCE using the */ | |
1635 | /* inverse UCA & getPrevious. The next bound is not set, and will be decided */ | |
1636 | /* in ucol_bld */ | |
1637 | uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; | |
1638 | uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; | |
1639 | uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; | |
1640 | uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; | |
1641 | ||
374ca955 A |
1642 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
1643 | if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ | |
1644 | uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16; | |
1645 | uint32_t raw = uprv_uca_getRawFromImplicit(primary); | |
1646 | uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); | |
1647 | CE = primaryCE & UCOL_PRIMARYMASK | 0x0505; | |
1648 | SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER; | |
1649 | } else { | |
1650 | /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ | |
1651 | ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); | |
1652 | } | |
b75a7d8f A |
1653 | |
1654 | ListList[src->resultLen].baseCE = CE; | |
1655 | ListList[src->resultLen].baseContCE = SecondCE; | |
1656 | ListList[src->resultLen].nextCE = 0; | |
1657 | ListList[src->resultLen].nextContCE = 0; | |
1658 | ||
1659 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); | |
1660 | } | |
1661 | } | |
1662 | ||
1663 | ||
374ca955 A |
1664 | /* 5 If the relation is a reset: |
1665 | If sourceToken is null | |
1666 | Create new list, create new sourceToken, make the baseCE from source, put | |
b75a7d8f A |
1667 | the sourceToken in ListHeader of the new list */ |
1668 | if(sourceToken == NULL) { | |
1669 | /* | |
374ca955 A |
1670 | 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
1671 | First convert all expansions into normal form. Examples: | |
1672 | If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * | |
1673 | d * ... into &x * c/y * d * ... | |
1674 | Note: reset values can never have expansions, although they can cause the | |
1675 | very next item to have one. They may be contractions, if they are found | |
1676 | earlier in the list. | |
b75a7d8f A |
1677 | */ |
1678 | if(top == FALSE) { | |
1679 | collIterate s; | |
1680 | uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; | |
1681 | ||
1682 | uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s); | |
1683 | ||
1684 | CE = ucol_getNextCE(src->UCA, &s, status); | |
1685 | UChar *expand = s.pos; | |
1686 | SecondCE = ucol_getNextCE(src->UCA, &s, status); | |
1687 | ||
1688 | ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
1689 | if(isContinuation(SecondCE)) { | |
1690 | ListList[src->resultLen].baseContCE = SecondCE; | |
1691 | } else { | |
1692 | ListList[src->resultLen].baseContCE = 0; | |
1693 | } | |
1694 | ListList[src->resultLen].nextCE = 0; | |
1695 | ListList[src->resultLen].nextContCE = 0; | |
1696 | ListList[src->resultLen].previousCE = 0; | |
1697 | ListList[src->resultLen].previousContCE = 0; | |
1698 | ListList[src->resultLen].indirect = FALSE; | |
1699 | sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); | |
1700 | } else { /* top == TRUE */ | |
1701 | /* just use the supplied values */ | |
1702 | top = FALSE; | |
1703 | ListList[src->resultLen].previousCE = 0; | |
1704 | ListList[src->resultLen].previousContCE = 0; | |
1705 | ListList[src->resultLen].indirect = TRUE; | |
1706 | ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; | |
1707 | ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; | |
1708 | ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; | |
1709 | ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; | |
1710 | ||
1711 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); | |
1712 | ||
1713 | } | |
1714 | } else { /* reset to something already in rules */ | |
1715 | top = FALSE; | |
1716 | } | |
1717 | } | |
374ca955 | 1718 | /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ |
b75a7d8f A |
1719 | lastToken = sourceToken; |
1720 | } else { | |
1721 | if(U_FAILURE(*status)) { | |
1722 | return 0; | |
1723 | } | |
1724 | } | |
1725 | } | |
1726 | ||
1727 | if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { | |
1728 | src->resultLen--; | |
1729 | } | |
1730 | return src->resultLen; | |
1731 | } | |
1732 | ||
374ca955 | 1733 | void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) { |
b75a7d8f A |
1734 | uint32_t nSize = 0; |
1735 | uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); | |
1736 | if(U_FAILURE(*status)) { | |
1737 | return; | |
1738 | } | |
374ca955 | 1739 | |
b75a7d8f A |
1740 | // set everything to zero, so that we can clean up gracefully |
1741 | uprv_memset(src, 0, sizeof(UColTokenParser)); | |
374ca955 | 1742 | |
b75a7d8f A |
1743 | // first we need to find options that don't like to be normalized, |
1744 | // like copy and remove... | |
1745 | //const UChar *openBrace = rules; | |
1746 | int32_t optionNumber = -1; | |
1747 | const UChar *setStart; | |
1748 | uint32_t i = 0; | |
1749 | while(i < rulesLength) { | |
1750 | if(rules[i] == 0x005B) { | |
1751 | // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces | |
1752 | //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart); | |
1753 | optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); | |
1754 | if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ | |
1755 | USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); | |
1756 | if(U_SUCCESS(*status)) { | |
1757 | if(src->copySet == NULL) { | |
1758 | src->copySet = newSet; | |
1759 | } else { | |
1760 | ((UnicodeSet *)src->copySet)->addAll(*((UnicodeSet *)newSet)); | |
1761 | uset_close(newSet); | |
1762 | } | |
1763 | } else { | |
1764 | return; | |
1765 | } | |
1766 | } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { | |
1767 | USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); | |
1768 | if(U_SUCCESS(*status)) { | |
1769 | if(src->removeSet == NULL) { | |
1770 | src->removeSet = newSet; | |
1771 | } else { | |
1772 | ((UnicodeSet *)src->removeSet)->addAll(*((UnicodeSet *)newSet)); | |
1773 | uset_close(newSet); | |
1774 | } | |
1775 | } else { | |
1776 | return; | |
1777 | } | |
1778 | } | |
1779 | } | |
1780 | //openBrace++; | |
1781 | i++; | |
1782 | } | |
1783 | ||
1784 | src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); | |
1785 | /* test for NULL */ | |
1786 | if (src->source == NULL) { | |
1787 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1788 | return; | |
1789 | } | |
374ca955 | 1790 | uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); |
b75a7d8f A |
1791 | nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); |
1792 | if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { | |
1793 | *status = U_ZERO_ERROR; | |
1794 | src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); | |
1795 | /* test for NULL */ | |
1796 | if (src->source == NULL) { | |
1797 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1798 | return; | |
1799 | } | |
1800 | nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); | |
1801 | } | |
1802 | src->current = src->source; | |
1803 | src->end = src->source+nSize; | |
1804 | src->sourceCurrent = src->source; | |
374ca955 | 1805 | src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly |
b75a7d8f A |
1806 | src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; |
1807 | src->varTop = NULL; | |
1808 | src->UCA = UCA; | |
1809 | src->invUCA = ucol_initInverseUCA(status); | |
1810 | src->parsedToken.charsLen = 0; | |
1811 | src->parsedToken.charsOffset = 0; | |
1812 | src->parsedToken.extensionLen = 0; | |
1813 | src->parsedToken.extensionOffset = 0; | |
1814 | src->parsedToken.prefixLen = 0; | |
1815 | src->parsedToken.prefixOffset = 0; | |
1816 | src->parsedToken.flags = 0; | |
1817 | src->parsedToken.strength = UCOL_TOK_UNSET; | |
1818 | ||
1819 | ||
1820 | if(U_FAILURE(*status)) { | |
1821 | return; | |
1822 | } | |
1823 | src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, status); | |
1824 | if(U_FAILURE(*status)) { | |
1825 | return; | |
1826 | } | |
1827 | uhash_setValueDeleter(src->tailored, uhash_freeBlock); | |
1828 | ||
1829 | src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); | |
1830 | /* test for NULL */ | |
1831 | if (src->opts == NULL) { | |
1832 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1833 | return; | |
1834 | } | |
1835 | ||
1836 | uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); | |
1837 | ||
1838 | // rulesToParse = src->source; | |
1839 | src->lh = 0; | |
1840 | src->listCapacity = 1024; | |
1841 | src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); | |
1842 | //Test for NULL | |
1843 | if (src->lh == NULL) { | |
1844 | *status = U_MEMORY_ALLOCATION_ERROR; | |
1845 | return; | |
1846 | } | |
374ca955 | 1847 | uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); |
b75a7d8f A |
1848 | src->resultLen = 0; |
1849 | ||
1850 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); | |
1851 | ||
1852 | // UCOL_RESET_TOP_VALUE | |
374ca955 | 1853 | setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); |
b75a7d8f A |
1854 | // UCOL_FIRST_PRIMARY_IGNORABLE |
1855 | setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); | |
1856 | // UCOL_LAST_PRIMARY_IGNORABLE | |
1857 | setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); | |
1858 | // UCOL_FIRST_SECONDARY_IGNORABLE | |
1859 | setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); | |
1860 | // UCOL_LAST_SECONDARY_IGNORABLE | |
1861 | setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); | |
1862 | // UCOL_FIRST_TERTIARY_IGNORABLE | |
1863 | setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); | |
1864 | // UCOL_LAST_TERTIARY_IGNORABLE | |
1865 | setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); | |
1866 | // UCOL_FIRST_VARIABLE | |
1867 | setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); | |
1868 | // UCOL_LAST_VARIABLE | |
1869 | setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); | |
1870 | // UCOL_FIRST_NON_VARIABLE | |
1871 | setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); | |
1872 | // UCOL_LAST_NON_VARIABLE | |
1873 | setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); | |
1874 | // UCOL_FIRST_IMPLICIT | |
1875 | setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); | |
1876 | // UCOL_LAST_IMPLICIT | |
1877 | setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); | |
1878 | // UCOL_FIRST_TRAILING | |
1879 | setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); | |
1880 | // UCOL_LAST_TRAILING | |
1881 | setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); | |
1882 | ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); | |
1883 | } | |
1884 | ||
1885 | ||
1886 | void ucol_tok_closeTokenList(UColTokenParser *src) { | |
1887 | if(src->copySet != NULL) { | |
1888 | uset_close(src->copySet); | |
1889 | } | |
1890 | if(src->removeSet != NULL) { | |
1891 | uset_close(src->removeSet); | |
1892 | } | |
1893 | if(src->tailored != NULL) { | |
1894 | uhash_close(src->tailored); | |
1895 | } | |
1896 | if(src->lh != NULL) { | |
1897 | uprv_free(src->lh); | |
1898 | } | |
1899 | if(src->source != NULL) { | |
1900 | uprv_free(src->source); | |
1901 | } | |
1902 | if(src->opts != NULL) { | |
1903 | uprv_free(src->opts); | |
1904 | } | |
1905 | } | |
1906 | ||
1907 | #endif /* #if !UCONFIG_NO_COLLATION */ | |
1908 |