]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol_tok.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_tok.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_tok.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module reads a tailoring rule string and produces a list of
17 * tokens that will be turned into collation elements
18 *
19 */
20
21 #include "unicode/utypes.h"
22
23 #if !UCONFIG_NO_COLLATION
24
25 #include "unicode/uscript.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/uniset.h"
29
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "ucol_bld.h"
33 #include "ucol_tok.h"
34 #include "ulocimp.h"
35 #include "uresimp.h"
36 #include "util.h"
37
38 // Define this only for debugging.
39 // #define DEBUG_FOR_COLL_RULES 1
40
41 #ifdef DEBUG_FOR_COLL_RULES
42 #include <iostream>
43 #endif
44
45 U_NAMESPACE_USE
46
47 U_CDECL_BEGIN
48 static int32_t U_CALLCONV
49 uhash_hashTokens(const UHashTok k)
50 {
51 int32_t hash = 0;
52 //uint32_t key = (uint32_t)k.integer;
53 UColToken *key = (UColToken *)k.pointer;
54 if (key != 0) {
55 int32_t len = (key->source & 0xFF000000)>>24;
56 int32_t inc = ((len - 32) / 32) + 1;
57
58 const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
59 const UChar *limit = p + len;
60
61 while (p<limit) {
62 hash = (hash * 37) + *p;
63 p += inc;
64 }
65 }
66 return hash;
67 }
68
69 static UBool U_CALLCONV
70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
71 {
72 //uint32_t p1 = (uint32_t) key1.integer;
73 //uint32_t p2 = (uint32_t) key2.integer;
74 UColToken *p1 = (UColToken *)key1.pointer;
75 UColToken *p2 = (UColToken *)key2.pointer;
76 const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
77 const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
80 const UChar *end = s1+s1L-1;
81
82 if (p1 == p2) {
83 return TRUE;
84 }
85 if (p1->source == 0 || p2->source == 0) {
86 return FALSE;
87 }
88 if(s1L != s2L) {
89 return FALSE;
90 }
91 if(p1->source == p2->source) {
92 return TRUE;
93 }
94 while((s1 < end) && *s1 == *s2) {
95 ++s1;
96 ++s2;
97 }
98 if(*s1 == *s2) {
99 return TRUE;
100 } else {
101 return FALSE;
102 }
103 }
104 U_CDECL_END
105
106 /*
107 * Debug messages used to pinpoint where a format error occurred.
108 * A better way is to include context-sensitive information in syntaxError() function.
109 *
110 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_FORMAT_ERROR
111 * on the compile line.
112 */
113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */
114
115 #ifdef DEBUG_FOR_FORMAT_ERROR
116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
117 #else
118 #define DBG_FORMAT_ERROR
119 #endif
120
121
122 /*
123 * Controls debug messages so that the output can be compared before and after a
124 * big change. Prints the information of every code point that comes out of the
125 * collation parser and its strength into a file. When a big change in format
126 * happens, the files before and after the change should be identical.
127 *
128 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_CODE_POINTS
129 * on the compile line.
130 */
131 // #define DEBUG_FOR_CODE_POINTS 1
132
133 #ifdef DEBUG_FOR_CODE_POINTS
134 FILE* dfcp_fp = NULL;
135 #endif
136
137
138 /*static inline void U_CALLCONV
139 uhash_freeBlockWrapper(void *obj) {
140 uhash_freeBlock(obj);
141 }*/
142
143
/*
 * CE values recorded for one indirect (symbolic) reset position.
 * setIndirectBoundaries() fills the start pair from its `start` argument and
 * the limit pair from its `end` argument, or with zeros when no end is given.
 */
typedef struct {
    uint32_t startCE;      /* CE of the start of the position */
    uint32_t startContCE;  /* second CE of the start; 0 means "none" in ucol_tok_doSetTop() */
    uint32_t limitCE;      /* CE of the limit, 0 when the position has no limit */
    uint32_t limitContCE;  /* second CE of the limit, 0 when the position has no limit */
} indirectBoundaries;

/*
 * These values are used for finding CE values for indirect positioning.
 * Indirect positioning is a mechanism for allowing resets on symbolic values.
 * It only works for resets, and indirect names cannot be tailored.
 *
 * An indirect name can define either an anchor point or a range. An anchor
 * point behaves in exactly the same way as a code point in a reset would,
 * except that it cannot be tailored. A range (currently only the [top] range
 * is known) explicitly sets the upper bound for generated CEs, thus allowing
 * better control over how many CEs can be squeezed into the range without a
 * performance penalty. In that respect, [top] is used for tailoring locales
 * that use CJK characters.
 *
 * Other indirect values are currently a pure convenience: they can be used to
 * ensure that the CEs will always be positioned in the same place relative to
 * a point with known properties (e.g. first primary ignorable).
 */
static indirectBoundaries ucolIndirectBoundaries[15];
166 /*
167 static indirectBoundaries ucolIndirectBoundaries[11] = {
168 { UCOL_RESET_TOP_VALUE, 0,
169 UCOL_NEXT_TOP_VALUE, 0 },
170 { UCOL_FIRST_PRIMARY_IGNORABLE, 0,
171 0, 0 },
172 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
173 0, 0 },
174 { UCOL_FIRST_SECONDARY_IGNORABLE, 0,
175 0, 0 },
176 { UCOL_LAST_SECONDARY_IGNORABLE, 0,
177 0, 0 },
178 { UCOL_FIRST_TERTIARY_IGNORABLE, 0,
179 0, 0 },
180 { UCOL_LAST_TERTIARY_IGNORABLE, 0,
181 0, 0 },
182 { UCOL_FIRST_VARIABLE, 0,
183 0, 0 },
184 { UCOL_LAST_VARIABLE, 0,
185 0, 0 },
186 { UCOL_FIRST_NON_VARIABLE, 0,
187 0, 0 },
188 { UCOL_LAST_NON_VARIABLE, 0,
189 0, 0 },
190 };
191 */
192
193 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
194
195 // Set values for the top - TODO: once we have values for all the indirects, we are going
196 // to initalize here.
197 ucolIndirectBoundaries[indexR].startCE = start[0];
198 ucolIndirectBoundaries[indexR].startContCE = start[1];
199 if(end) {
200 ucolIndirectBoundaries[indexR].limitCE = end[0];
201 ucolIndirectBoundaries[indexR].limitContCE = end[1];
202 } else {
203 ucolIndirectBoundaries[indexR].limitCE = 0;
204 ucolIndirectBoundaries[indexR].limitContCE = 0;
205 }
206 }
207
208
209 static inline
210 void syntaxError(const UChar* rules,
211 int32_t pos,
212 int32_t rulesLen,
213 UParseError* parseError)
214 {
215 parseError->offset = pos;
216 parseError->line = 0 ; /* we are not using line numbers */
217
218 // for pre-context
219 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
220 int32_t stop = pos;
221
222 u_memcpy(parseError->preContext,rules+start,stop-start);
223 //null terminate the buffer
224 parseError->preContext[stop-start] = 0;
225
226 //for post-context
227 start = pos+1;
228 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
229 rulesLen;
230
231 if(start < stop) {
232 u_memcpy(parseError->postContext,rules+start,stop-start);
233 //null terminate the buffer
234 parseError->postContext[stop-start]= 0;
235 } else {
236 parseError->postContext[0] = 0;
237 }
238 }
239
240 static
241 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
242 switch(attrib) {
243 case UCOL_HIRAGANA_QUATERNARY_MODE:
244 opts->hiraganaQ = value;
245 break;
246 case UCOL_FRENCH_COLLATION:
247 opts->frenchCollation = value;
248 break;
249 case UCOL_ALTERNATE_HANDLING:
250 opts->alternateHandling = value;
251 break;
252 case UCOL_CASE_FIRST:
253 opts->caseFirst = value;
254 break;
255 case UCOL_CASE_LEVEL:
256 opts->caseLevel = value;
257 break;
258 case UCOL_NORMALIZATION_MODE:
259 opts->normalizationMode = value;
260 break;
261 case UCOL_STRENGTH:
262 opts->strength = value;
263 break;
264 case UCOL_NUMERIC_COLLATION:
265 opts->numericCollation = value;
266 break;
267 case UCOL_ATTRIBUTE_COUNT:
268 default:
269 break;
270 }
271 }
272
273 #define UTOK_OPTION_COUNT 22
274
275 static UBool didInit = FALSE;
276 /* we can be strict, or we can be lenient */
277 /* I'd surely be lenient with the option arguments */
278 /* maybe even with options */
279 U_STRING_DECL(suboption_00, "non-ignorable", 13);
280 U_STRING_DECL(suboption_01, "shifted", 7);
281
282 U_STRING_DECL(suboption_02, "lower", 5);
283 U_STRING_DECL(suboption_03, "upper", 5);
284 U_STRING_DECL(suboption_04, "off", 3);
285 U_STRING_DECL(suboption_05, "on", 2);
286 U_STRING_DECL(suboption_06, "1", 1);
287 U_STRING_DECL(suboption_07, "2", 1);
288 U_STRING_DECL(suboption_08, "3", 1);
289 U_STRING_DECL(suboption_09, "4", 1);
290 U_STRING_DECL(suboption_10, "I", 1);
291
292 U_STRING_DECL(suboption_11, "primary", 7);
293 U_STRING_DECL(suboption_12, "secondary", 9);
294 U_STRING_DECL(suboption_13, "tertiary", 8);
295 U_STRING_DECL(suboption_14, "variable", 8);
296 U_STRING_DECL(suboption_15, "regular", 7);
297 U_STRING_DECL(suboption_16, "implicit", 8);
298 U_STRING_DECL(suboption_17, "trailing", 8);
299
300
301 U_STRING_DECL(option_00, "undefined", 9);
302 U_STRING_DECL(option_01, "rearrange", 9);
303 U_STRING_DECL(option_02, "alternate", 9);
304 U_STRING_DECL(option_03, "backwards", 9);
305 U_STRING_DECL(option_04, "variable top", 12);
306 U_STRING_DECL(option_05, "top", 3);
307 U_STRING_DECL(option_06, "normalization", 13);
308 U_STRING_DECL(option_07, "caseLevel", 9);
309 U_STRING_DECL(option_08, "caseFirst", 9);
310 U_STRING_DECL(option_09, "scriptOrder", 11);
311 U_STRING_DECL(option_10, "charsetname", 11);
312 U_STRING_DECL(option_11, "charset", 7);
313 U_STRING_DECL(option_12, "before", 6);
314 U_STRING_DECL(option_13, "hiraganaQ", 9);
315 U_STRING_DECL(option_14, "strength", 8);
316 U_STRING_DECL(option_15, "first", 5);
317 U_STRING_DECL(option_16, "last", 4);
318 U_STRING_DECL(option_17, "optimize", 8);
319 U_STRING_DECL(option_18, "suppressContractions", 20);
320 U_STRING_DECL(option_19, "numericOrdering", 15);
321 U_STRING_DECL(option_20, "import", 6);
322 U_STRING_DECL(option_21, "reorder", 7);
323
324 /*
325 [last variable] last variable value
326 [last primary ignorable] largest CE for primary ignorable
327 [last secondary ignorable] largest CE for secondary ignorable
328 [last tertiary ignorable] largest CE for tertiary ignorable
329 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
330 */
331
332
333 static const ucolTokSuboption alternateSub[2] = {
334 {suboption_00, 13, UCOL_NON_IGNORABLE},
335 {suboption_01, 7, UCOL_SHIFTED}
336 };
337
338 static const ucolTokSuboption caseFirstSub[3] = {
339 {suboption_02, 5, UCOL_LOWER_FIRST},
340 {suboption_03, 5, UCOL_UPPER_FIRST},
341 {suboption_04, 3, UCOL_OFF},
342 };
343
344 static const ucolTokSuboption onOffSub[2] = {
345 {suboption_04, 3, UCOL_OFF},
346 {suboption_05, 2, UCOL_ON}
347 };
348
349 static const ucolTokSuboption frenchSub[1] = {
350 {suboption_07, 1, UCOL_ON}
351 };
352
353 static const ucolTokSuboption beforeSub[3] = {
354 {suboption_06, 1, UCOL_PRIMARY},
355 {suboption_07, 1, UCOL_SECONDARY},
356 {suboption_08, 1, UCOL_TERTIARY}
357 };
358
359 static const ucolTokSuboption strengthSub[5] = {
360 {suboption_06, 1, UCOL_PRIMARY},
361 {suboption_07, 1, UCOL_SECONDARY},
362 {suboption_08, 1, UCOL_TERTIARY},
363 {suboption_09, 1, UCOL_QUATERNARY},
364 {suboption_10, 1, UCOL_IDENTICAL},
365 };
366
367 static const ucolTokSuboption firstLastSub[7] = {
368 {suboption_11, 7, UCOL_PRIMARY},
369 {suboption_12, 9, UCOL_PRIMARY},
370 {suboption_13, 8, UCOL_PRIMARY},
371 {suboption_14, 8, UCOL_PRIMARY},
372 {suboption_15, 7, UCOL_PRIMARY},
373 {suboption_16, 8, UCOL_PRIMARY},
374 {suboption_17, 8, UCOL_PRIMARY},
375 };
376
/*
 * Indices into rulesOptions[]. ucol_uprv_tok_readOption() returns the index
 * of the matching rulesOptions[] entry and ucol_uprv_tok_readAndSetOption()
 * switches on it using these names, so the two must stay in the same order.
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,                             /*  0 */
    OPTION_FRENCH_COLLATION,                                   /*  1 */
    OPTION_CASE_LEVEL,                                         /*  2 */
    OPTION_CASE_FIRST,                                         /*  3 */
    OPTION_NORMALIZATION_MODE,                                 /*  4 */
    OPTION_HIRAGANA_QUATERNARY,                                /*  5 */
    OPTION_STRENGTH,                                           /*  6 */
    OPTION_NUMERIC_COLLATION,                                  /*  7 */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,    /*  7: last option that maps to a UColAttribute */
    OPTION_VARIABLE_TOP,                                       /*  8 */
    OPTION_REARRANGE,                                          /*  9 */
    OPTION_BEFORE,                                             /* 10 */
    OPTION_TOP,                                                /* 11 */
    OPTION_FIRST,                                              /* 12 */
    OPTION_LAST,                                               /* 13 */
    OPTION_OPTIMIZE,                                           /* 14 */
    OPTION_SUPPRESS_CONTRACTIONS,                              /* 15 */
    OPTION_UNDEFINED,                                          /* 16 */
    OPTION_SCRIPT_ORDER,                                       /* 17 */
    OPTION_CHARSET_NAME,                                       /* 18 */
    OPTION_CHARSET,                                            /* 19 */
    OPTION_IMPORT,                                             /* 20 */
    OPTION_SCRIPTREORDER                                       /* 21 */
} ;
402
403 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
404 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
405 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
406 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
407 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
408 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
409 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
410 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
411 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
412 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
413 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
414 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
415 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
416 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
417 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
418 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
419 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
420 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
421 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
422 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
423 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */
424 /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */
425 /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */
426 };
427
428 static
429 int32_t u_strncmpNoCase(const UChar *s1,
430 const UChar *s2,
431 int32_t n)
432 {
433 if(n > 0) {
434 int32_t rc;
435 for(;;) {
436 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
437 if(rc != 0 || *s1 == 0 || --n == 0) {
438 return rc;
439 }
440 ++s1;
441 ++s2;
442 }
443 }
444 return 0;
445 }
446
447 static
448 void ucol_uprv_tok_initData() {
449 if(!didInit) {
450 U_STRING_INIT(suboption_00, "non-ignorable", 13);
451 U_STRING_INIT(suboption_01, "shifted", 7);
452
453 U_STRING_INIT(suboption_02, "lower", 5);
454 U_STRING_INIT(suboption_03, "upper", 5);
455 U_STRING_INIT(suboption_04, "off", 3);
456 U_STRING_INIT(suboption_05, "on", 2);
457
458 U_STRING_INIT(suboption_06, "1", 1);
459 U_STRING_INIT(suboption_07, "2", 1);
460 U_STRING_INIT(suboption_08, "3", 1);
461 U_STRING_INIT(suboption_09, "4", 1);
462 U_STRING_INIT(suboption_10, "I", 1);
463
464 U_STRING_INIT(suboption_11, "primary", 7);
465 U_STRING_INIT(suboption_12, "secondary", 9);
466 U_STRING_INIT(suboption_13, "tertiary", 8);
467 U_STRING_INIT(suboption_14, "variable", 8);
468 U_STRING_INIT(suboption_15, "regular", 7);
469 U_STRING_INIT(suboption_16, "implicit", 8);
470 U_STRING_INIT(suboption_17, "trailing", 8);
471
472
473 U_STRING_INIT(option_00, "undefined", 9);
474 U_STRING_INIT(option_01, "rearrange", 9);
475 U_STRING_INIT(option_02, "alternate", 9);
476 U_STRING_INIT(option_03, "backwards", 9);
477 U_STRING_INIT(option_04, "variable top", 12);
478 U_STRING_INIT(option_05, "top", 3);
479 U_STRING_INIT(option_06, "normalization", 13);
480 U_STRING_INIT(option_07, "caseLevel", 9);
481 U_STRING_INIT(option_08, "caseFirst", 9);
482 U_STRING_INIT(option_09, "scriptOrder", 11);
483 U_STRING_INIT(option_10, "charsetname", 11);
484 U_STRING_INIT(option_11, "charset", 7);
485 U_STRING_INIT(option_12, "before", 6);
486 U_STRING_INIT(option_13, "hiraganaQ", 9);
487 U_STRING_INIT(option_14, "strength", 8);
488 U_STRING_INIT(option_15, "first", 5);
489 U_STRING_INIT(option_16, "last", 4);
490 U_STRING_INIT(option_17, "optimize", 8);
491 U_STRING_INIT(option_18, "suppressContractions", 20);
492 U_STRING_INIT(option_19, "numericOrdering", 15);
493 U_STRING_INIT(option_20, "import ", 6);
494 U_STRING_INIT(option_21, "reorder", 7);
495 didInit = TRUE;
496 }
497 }
498
499
500 // This function reads basic options to set in the runtime collator
501 // used by data driven tests. Should not support build time options
502 U_CAPI const UChar * U_EXPORT2
503 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
504 UColAttribute *attrib, UColAttributeValue *value,
505 UErrorCode *status)
506 {
507 uint32_t i = 0;
508 int32_t j=0;
509 UBool foundOption = FALSE;
510 const UChar *optionArg = NULL;
511
512 ucol_uprv_tok_initData();
513
514 while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */
515 start++;
516 }
517 if(start >= end) {
518 return NULL;
519 }
520 /* skip opening '[' */
521 if(*start == 0x005b) {
522 start++;
523 } else {
524 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
525 return NULL;
526 }
527
528 while(i < UTOK_OPTION_COUNT) {
529 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
530 foundOption = TRUE;
531 if(end - start > rulesOptions[i].optionLen) {
532 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
533 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
534 optionArg++;
535 }
536 }
537 break;
538 }
539 i++;
540 }
541
542 if(!foundOption) {
543 *status = U_ILLEGAL_ARGUMENT_ERROR;
544 return NULL;
545 }
546
547 if(optionArg) {
548 for(j = 0; j<rulesOptions[i].subSize; j++) {
549 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
550 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
551 *attrib = rulesOptions[i].attr;
552 *value = rulesOptions[i].subopts[j].attrVal;
553 optionArg += rulesOptions[i].subopts[j].subLen;
554 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
555 optionArg++;
556 }
557 if(*optionArg == 0x005d) {
558 optionArg++;
559 return optionArg;
560 } else {
561 *status = U_ILLEGAL_ARGUMENT_ERROR;
562 return NULL;
563 }
564 }
565 }
566 }
567 *status = U_ILLEGAL_ARGUMENT_ERROR;
568 return NULL;
569 }
570
571 static
572 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
573 while(*start != 0x005b) { /* advance while we find the first '[' */
574 start++;
575 }
576 // now we need to get a balanced set of '[]'. The problem is that a set can have
577 // many, and *end point to the first closing '['
578 int32_t noOpenBraces = 1;
579 int32_t current = 1; // skip the opening brace
580 while(start+current < end && noOpenBraces != 0) {
581 if(start[current] == 0x005b) {
582 noOpenBraces++;
583 } else if(start[current] == 0x005D) { // closing brace
584 noOpenBraces--;
585 }
586 current++;
587 }
588
589 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
590 *status = U_ILLEGAL_ARGUMENT_ERROR;
591 return NULL;
592 }
593 return uset_openPattern(start, current, status);
594 }
595
596 /**
597 * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
598 * @param start Pointer to the start UChar.
599 * @param end Pointer to the last valid pointer beyond which the option will not extend.
600 * @param optionArg Address of the pointer at which the options start (after the option name)
601 * @return The index of the option, or -1 if the option is not valid.
602 */
603 static
604 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
605 int32_t i = 0;
606 ucol_uprv_tok_initData();
607
608 while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */
609 start++;
610 }
611 while(i < UTOK_OPTION_COUNT) {
612 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
613 if(end - start > rulesOptions[i].optionLen) {
614 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
615 while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */
616 (*optionArg)++;
617 }
618 }
619 break;
620 }
621 i++;
622 }
623 if(i == UTOK_OPTION_COUNT) {
624 i = -1; // didn't find an option
625 }
626 return i;
627 }
628
629
630 static
631 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
632 int32_t codeCount = 0;
633 int32_t codeIndex = 0;
634 char conversion[64];
635 int32_t tokenLength = 0;
636 const UChar* space;
637
638 const UChar* current = src->current;
639 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
640
641 // eat leading whitespace
642 while(current < end && u_isWhitespace(*current)) {
643 current++;
644 }
645
646 while(current < end) {
647 space = u_memchr(current, 0x0020, end - current);
648 space = space == 0 ? end : space;
649 tokenLength = space - current;
650 if (tokenLength < 4) {
651 *status = U_INVALID_FORMAT_ERROR;
652 return;
653 }
654 codeCount++;
655 current += tokenLength;
656 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
657 ++current;
658 }
659 }
660
661 if (codeCount == 0) {
662 *status = U_INVALID_FORMAT_ERROR;
663 }
664
665 src->reorderCodesLength = codeCount;
666 src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
667 current = src->current;
668
669 // eat leading whitespace
670 while(current < end && u_isWhitespace(*current)) {
671 current++;
672 }
673
674 while(current < end) {
675 space = u_memchr(current, 0x0020, end - current);
676 space = space == 0 ? end : space;
677 tokenLength = space - current;
678 if (tokenLength < 4) {
679 *status = U_ILLEGAL_ARGUMENT_ERROR;
680 return;
681 } else {
682 u_UCharsToChars(current, conversion, tokenLength);
683 conversion[tokenLength] = '\0';
684 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
685 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
686 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
687 }
688 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
689 *status = U_ILLEGAL_ARGUMENT_ERROR;
690 }
691 }
692 codeIndex++;
693 current += tokenLength;
694 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
695 ++current;
696 }
697 }
698 }
699
700 // reads and conforms to various options in rules
701 // end is the position of the first closing ']'
702 // However, some of the options take an UnicodeSet definition
703 // which needs to duplicate the closing ']'
704 // for example: '[copy [\uAC00-\uD7FF]]'
705 // These options will move end to the second ']' and the
706 // caller will set the current to it.
707 static
708 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
709 const UChar* start = src->current;
710 int32_t i = 0;
711 int32_t j=0;
712 const UChar *optionArg = NULL;
713
714 uint8_t result = 0;
715
716 start++; /*skip opening '['*/
717 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
718 if(optionArg) {
719 src->current = optionArg;
720 }
721
722 if(i < 0) {
723 *status = U_ILLEGAL_ARGUMENT_ERROR;
724 } else {
725 int32_t noOpenBraces = 1;
726 switch(i) {
727 case OPTION_ALTERNATE_HANDLING:
728 case OPTION_FRENCH_COLLATION:
729 case OPTION_CASE_LEVEL:
730 case OPTION_CASE_FIRST:
731 case OPTION_NORMALIZATION_MODE:
732 case OPTION_HIRAGANA_QUATERNARY:
733 case OPTION_STRENGTH:
734 case OPTION_NUMERIC_COLLATION:
735 if(optionArg) {
736 for(j = 0; j<rulesOptions[i].subSize; j++) {
737 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
738 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
739 result = UCOL_TOK_SUCCESS;
740 }
741 }
742 }
743 if(result == 0) {
744 *status = U_ILLEGAL_ARGUMENT_ERROR;
745 }
746 break;
747 case OPTION_VARIABLE_TOP:
748 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
749 break;
750 case OPTION_REARRANGE:
751 result = UCOL_TOK_SUCCESS;
752 break;
753 case OPTION_BEFORE:
754 if(optionArg) {
755 for(j = 0; j<rulesOptions[i].subSize; j++) {
756 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
757 result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
758 }
759 }
760 }
761 if(result == 0) {
762 *status = U_ILLEGAL_ARGUMENT_ERROR;
763 }
764 break;
765 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
766 /* index to this array will be src->parsedToken.indirectIndex*/
767 src->parsedToken.indirectIndex = 0;
768 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
769 break;
770 case OPTION_FIRST:
771 case OPTION_LAST: /* first, last */
772 for(j = 0; j<rulesOptions[i].subSize; j++) {
773 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
774 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
775 // element of indirect boundaries is reserved for top.
776 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
777 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
778 }
779 }
780 if(result == 0) {
781 *status = U_ILLEGAL_ARGUMENT_ERROR;
782 }
783 break;
784 case OPTION_OPTIMIZE:
785 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
786 // we need to move end here
787 src->current++; // skip opening brace
788 while(src->current < src->end && noOpenBraces != 0) {
789 if(*src->current == 0x005b) {
790 noOpenBraces++;
791 } else if(*src->current == 0x005D) { // closing brace
792 noOpenBraces--;
793 }
794 src->current++;
795 }
796 result = UCOL_TOK_SUCCESS;
797 break;
798 case OPTION_SCRIPTREORDER:
799 ucol_tok_parseScriptReorder(src, status);
800 break;
801 default:
802 *status = U_UNSUPPORTED_ERROR;
803 break;
804 }
805 }
806 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
807 return result;
808 }
809
810
811 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
812 if (stuff == NULL || len <= 0) {
813 return;
814 }
815 UnicodeString tempStuff(FALSE, stuff, len);
816 if(src->extraCurrent+len >= src->extraEnd) {
817 /* reallocate */
818 if (stuff >= src->source && stuff <= src->end) {
819 // Copy the "stuff" contents into tempStuff's own buffer.
820 // UnicodeString is copy-on-write.
821 if (len > 0) {
822 tempStuff.setCharAt(0, tempStuff[0]);
823 } else {
824 tempStuff.remove();
825 }
826 }
827 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
828 if(newSrc != NULL) {
829 src->current = newSrc + (src->current - src->source);
830 src->extraCurrent = newSrc + (src->extraCurrent - src->source);
831 src->end = newSrc + (src->end - src->source);
832 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
833 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
834 src->source = newSrc;
835 } else {
836 *status = U_MEMORY_ALLOCATION_ERROR;
837 return;
838 }
839 }
840 if(len == 1) {
841 *src->extraCurrent++ = tempStuff[0];
842 } else {
843 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
844 src->extraCurrent += len;
845 }
846 }
847
848 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
849 /*
850 top = TRUE;
851 */
852 UChar buff[5];
853 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
854 buff[0] = 0xFFFE;
855 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
856 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
857 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
858 src->parsedToken.charsLen = 3;
859 ucol_tok_addToExtraCurrent(src, buff, 3, status);
860 } else {
861 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
862 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
863 src->parsedToken.charsLen = 5;
864 ucol_tok_addToExtraCurrent(src, buff, 5, status);
865 }
866 return TRUE;
867 }
868
869 static UBool isCharNewLine(UChar c){
870 switch(c){
871 case 0x000A: /* LF */
872 case 0x000D: /* CR */
873 case 0x000C: /* FF */
874 case 0x0085: /* NEL */
875 case 0x2028: /* LS */
876 case 0x2029: /* PS */
877 return TRUE;
878 default:
879 return FALSE;
880 }
881 }
882
883 /*
884 * This function is called several times when a range is processed. Each time, the next code point
885 * is processed.
886 * The following variables must be set before calling this function:
887 * src->currentRangeCp: The current code point to process.
888 * src->lastRangeCp: The last code point in the range.
889 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
890 */
891 static const UChar*
892 ucol_tok_processNextCodePointInRange(UColTokenParser *src,
893 UErrorCode *status)
894 {
895 // Append current code point to source
896 UChar buff[U16_MAX_LENGTH];
897 uint32_t i = 0;
898
899 uint32_t nChars = U16_LENGTH(src->currentRangeCp);
900 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
901 src->parsedToken.charsLen = nChars;
902
903 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
904 ucol_tok_addToExtraCurrent(src, buff, nChars, status);
905
906 ++src->currentRangeCp;
907 if (src->currentRangeCp > src->lastRangeCp) {
908 src->inRange = FALSE;
909
910 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
911 src->isStarred = FALSE;
912 }
913 } else {
914 src->previousCp = src->currentRangeCp;
915 }
916 return src->current;
917 }
918
919 /*
920 * This function is called several times when a starred list is processed. Each time, the next code point
921 * in the list is processed.
922 * The following variables must be set before calling this function:
923 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point.
924 * src->lastStarredCharIndex: Index to the last character in the list.
925 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
926 */
927 static const UChar*
928 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
929 {
930 // Extract the characters corresponding to the next code point.
931 UChar32 cp;
932 src->parsedToken.charsOffset = src->currentStarredCharIndex;
933 int32_t prev = src->currentStarredCharIndex;
934 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
935 src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
936
937 // When we are done parsing the starred string, turn the flag off so that
938 // the normal processing is restored.
939 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
940 src->isStarred = FALSE;
941 }
942 src->previousCp = cp;
943 return src->current;
944 }
945
946 /*
947 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
948 *
949 * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
950 * # : Comment character
951 * & : Reset operator
952 * = : Equality
953 * < : Primary collation
954 * << : Secondary collation
955 * <<< : Tertiary collation
956 * ; : Secondary collation
957 * , : Tertiary collation
958 * / : Expansions
959 * | : Prefix
960 * - : Range
961
962 * ! : Java Thai modifier, ignored
963 * @ : French only
964
965 * [] : Options
966 * '' : Quotes
967 *
968 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz
969 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
970 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a",
971 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous
972 * character returned as cached so that the calling program can do further splitting.
973 */
974 static const UChar*
975 ucol_tok_parseNextTokenInternal(UColTokenParser *src,
976 UBool startOfRules,
977 UParseError *parseError,
978 UErrorCode *status)
979 {
980 UBool variableTop = FALSE;
981 UBool top = FALSE;
982 UBool inChars = TRUE;
983 UBool inQuote = FALSE;
984 UBool wasInQuote = FALSE;
985 uint8_t before = 0;
986 UBool isEscaped = FALSE;
987
988 // TODO: replace these variables with src->parsedToken counterparts
989 // no need to use them anymore since we have src->parsedToken.
990 // Ideally, token parser would be a nice class... Once, when I have
991 // more time (around 2020 probably).
992 uint32_t newExtensionLen = 0;
993 uint32_t extensionOffset = 0;
994 uint32_t newStrength = UCOL_TOK_UNSET;
995 UChar buff[10];
996
997 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
998 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
999 src->parsedToken.indirectIndex = 0;
1000
1001 while (src->current < src->end) {
1002 UChar ch = *(src->current);
1003
1004 if (inQuote) {
1005 if (ch == 0x0027/*'\''*/) {
1006 inQuote = FALSE;
1007 } else {
1008 if ((src->parsedToken.charsLen == 0) || inChars) {
1009 if(src->parsedToken.charsLen == 0) {
1010 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1011 }
1012 src->parsedToken.charsLen++;
1013 } else {
1014 if(newExtensionLen == 0) {
1015 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1016 }
1017 newExtensionLen++;
1018 }
1019 }
1020 }else if(isEscaped){
1021 isEscaped =FALSE;
1022 if (newStrength == UCOL_TOK_UNSET) {
1023 *status = U_INVALID_FORMAT_ERROR;
1024 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1025 DBG_FORMAT_ERROR
1026 return NULL;
1027 // enabling rules to start with non-tokens a < b
1028 // newStrength = UCOL_TOK_RESET;
1029 }
1030 if(ch != 0x0000 && src->current != src->end) {
1031 if (inChars) {
1032 if(src->parsedToken.charsLen == 0) {
1033 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1034 }
1035 src->parsedToken.charsLen++;
1036 } else {
1037 if(newExtensionLen == 0) {
1038 extensionOffset = (uint32_t)(src->current - src->source);
1039 }
1040 newExtensionLen++;
1041 }
1042 }
1043 }else {
1044 if(!uprv_isRuleWhiteSpace(ch)) {
1045 /* Sets the strength for this entry */
1046 switch (ch) {
1047 case 0x003D/*'='*/ :
1048 if (newStrength != UCOL_TOK_UNSET) {
1049 goto EndOfLoop;
1050 }
1051
1052 /* if we start with strength, we'll reset to top */
1053 if(startOfRules == TRUE) {
1054 src->parsedToken.indirectIndex = 5;
1055 top = ucol_tok_doSetTop(src, status);
1056 newStrength = UCOL_TOK_RESET;
1057 goto EndOfLoop;
1058 }
1059 newStrength = UCOL_IDENTICAL;
1060 if(*(src->current+1) == 0x002A) {/*'*'*/
1061 src->current++;
1062 src->isStarred = TRUE;
1063 }
1064 break;
1065
1066 case 0x002C/*','*/:
1067 if (newStrength != UCOL_TOK_UNSET) {
1068 goto EndOfLoop;
1069 }
1070
1071 /* if we start with strength, we'll reset to top */
1072 if(startOfRules == TRUE) {
1073 src->parsedToken.indirectIndex = 5;
1074 top = ucol_tok_doSetTop(src, status);
1075 newStrength = UCOL_TOK_RESET;
1076 goto EndOfLoop;
1077 }
1078 newStrength = UCOL_TERTIARY;
1079 break;
1080
1081 case 0x003B/*';'*/:
1082 if (newStrength != UCOL_TOK_UNSET) {
1083 goto EndOfLoop;
1084 }
1085
1086 /* if we start with strength, we'll reset to top */
1087 if(startOfRules == TRUE) {
1088 src->parsedToken.indirectIndex = 5;
1089 top = ucol_tok_doSetTop(src, status);
1090 newStrength = UCOL_TOK_RESET;
1091 goto EndOfLoop;
1092 }
1093 newStrength = UCOL_SECONDARY;
1094 break;
1095
1096 case 0x003C/*'<'*/:
1097 if (newStrength != UCOL_TOK_UNSET) {
1098 goto EndOfLoop;
1099 }
1100
1101 /* if we start with strength, we'll reset to top */
1102 if(startOfRules == TRUE) {
1103 src->parsedToken.indirectIndex = 5;
1104 top = ucol_tok_doSetTop(src, status);
1105 newStrength = UCOL_TOK_RESET;
1106 goto EndOfLoop;
1107 }
1108 /* before this, do a scan to verify whether this is */
1109 /* another strength */
1110 if(*(src->current+1) == 0x003C) {
1111 src->current++;
1112 if(*(src->current+1) == 0x003C) {
1113 src->current++; /* three in a row! */
1114 newStrength = UCOL_TERTIARY;
1115 } else { /* two in a row */
1116 newStrength = UCOL_SECONDARY;
1117 }
1118 } else { /* just one */
1119 newStrength = UCOL_PRIMARY;
1120 }
1121 if(*(src->current+1) == 0x002A) {/*'*'*/
1122 src->current++;
1123 src->isStarred = TRUE;
1124 }
1125 break;
1126
1127 case 0x0026/*'&'*/:
1128 if (newStrength != UCOL_TOK_UNSET) {
1129 /**/
1130 goto EndOfLoop;
1131 }
1132
1133 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
1134 break;
1135
1136 case 0x005b/*'['*/:
1137 /* options - read an option, analyze it */
1138 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
1139 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
1140 if(U_SUCCESS(*status)) {
1141 if(result & UCOL_TOK_TOP) {
1142 if(newStrength == UCOL_TOK_RESET) {
1143 top = ucol_tok_doSetTop(src, status);
1144 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
1145 src->parsedToken.charsLen+=2;
1146 buff[0] = 0x002d;
1147 buff[1] = before;
1148 ucol_tok_addToExtraCurrent(src, buff, 2, status);
1149 }
1150
1151 src->current++;
1152 goto EndOfLoop;
1153 } else {
1154 *status = U_INVALID_FORMAT_ERROR;
1155 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1156 DBG_FORMAT_ERROR
1157 }
1158 } else if(result & UCOL_TOK_VARIABLE_TOP) {
1159 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
1160 variableTop = TRUE;
1161 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1162 src->parsedToken.charsLen = 1;
1163 buff[0] = 0xFFFF;
1164 ucol_tok_addToExtraCurrent(src, buff, 1, status);
1165 src->current++;
1166 goto EndOfLoop;
1167 } else {
1168 *status = U_INVALID_FORMAT_ERROR;
1169 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1170 DBG_FORMAT_ERROR
1171 }
1172 } else if (result & UCOL_TOK_BEFORE){
1173 if(newStrength == UCOL_TOK_RESET) {
1174 before = result & UCOL_TOK_BEFORE;
1175 } else {
1176 *status = U_INVALID_FORMAT_ERROR;
1177 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1178 DBG_FORMAT_ERROR
1179 }
1180 }
1181 } else {
1182 *status = U_INVALID_FORMAT_ERROR;
1183 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1184 DBG_FORMAT_ERROR
1185 return NULL;
1186 }
1187 }
1188 break;
1189 case 0x0021/*! skip java thai modifier reordering*/:
1190 break;
1191 case 0x002F/*'/'*/:
1192 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
1193 inChars = FALSE; /* we're now processing expansion */
1194 break;
1195 case 0x005C /* back slash for escaped chars */:
1196 isEscaped = TRUE;
1197 break;
1198 /* found a quote, we're gonna start copying */
1199 case 0x0027/*'\''*/:
1200 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
1201 *status = U_INVALID_FORMAT_ERROR;
1202 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1203 DBG_FORMAT_ERROR
1204 return NULL;
1205 // enabling rules to start with a non-token character a < b
1206 // newStrength = UCOL_TOK_RESET;
1207 }
1208
1209 inQuote = TRUE;
1210
1211 if(inChars) { /* we're doing characters */
1212 if(wasInQuote == FALSE) {
1213 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1214 }
1215 if (src->parsedToken.charsLen != 0) {
1216 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1217 }
1218 src->parsedToken.charsLen++;
1219 } else { /* we're doing an expansion */
1220 if(wasInQuote == FALSE) {
1221 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1222 }
1223 if (newExtensionLen != 0) {
1224 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
1225 }
1226 newExtensionLen++;
1227 }
1228
1229 wasInQuote = TRUE;
1230
1231 ch = *(++(src->current));
1232 if(ch == 0x0027) { /* copy the double quote */
1233 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1234 inQuote = FALSE;
1235 }
1236 break;
1237
1238 /* '@' is french only if the strength is not currently set */
1239 /* if it is, it's just a regular character in collation rules */
1240 case 0x0040/*'@'*/:
1241 if (newStrength == UCOL_TOK_UNSET) {
1242 src->opts->frenchCollation = UCOL_ON;
1243 break;
1244 }
1245
1246 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1247 // we want to store read characters to the prefix part and continue reading
1248 // the characters (proper way would be to restart reading the chars, but in
1249 // that case we would have to complicate the token hasher, which I do not
1250 // intend to play with. Instead, we will do prefixes when prefixes are due
1251 // (before adding the elements).
1252 src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1253 src->parsedToken.prefixLen = src->parsedToken.charsLen;
1254
1255 if(inChars) { /* we're doing characters */
1256 if(wasInQuote == FALSE) {
1257 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1258 }
1259 if (src->parsedToken.charsLen != 0) {
1260 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1261 }
1262 src->parsedToken.charsLen++;
1263 }
1264
1265 wasInQuote = TRUE;
1266
1267 do {
1268 ch = *(++(src->current));
1269 // skip whitespace between '|' and the character
1270 } while (uprv_isRuleWhiteSpace(ch));
1271 break;
1272
1273 //charsOffset = 0;
1274 //newCharsLen = 0;
1275 //break; // We want to store the whole prefix/character sequence. If we break
1276 // the '|' is going to get lost.
1277
1278 case 0x002D /*-*/: /* A range. */
1279 if (newStrength != UCOL_TOK_UNSET) {
1280 // While processing the pending token, the isStarred field
1281 // is reset, so it needs to be saved for the next
1282 // invocation.
1283 src->savedIsStarred = src->isStarred;
1284 goto EndOfLoop;
1285 }
1286 src->isStarred = src->savedIsStarred;
1287
1288 // Ranges are valid only in starred tokens.
1289 if (!src->isStarred) {
1290 *status = U_INVALID_FORMAT_ERROR;
1291 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1292 DBG_FORMAT_ERROR
1293 return NULL;
1294 }
1295 newStrength = src->parsedToken.strength;
1296 src->inRange = TRUE;
1297 break;
1298
1299 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1300 do {
1301 ch = *(++(src->current));
1302 } while (!isCharNewLine(ch));
1303
1304 break;
1305 default:
1306 if (newStrength == UCOL_TOK_UNSET) {
1307 *status = U_INVALID_FORMAT_ERROR;
1308 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1309 DBG_FORMAT_ERROR
1310 return NULL;
1311 }
1312
1313 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1314 *status = U_INVALID_FORMAT_ERROR;
1315 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1316 DBG_FORMAT_ERROR
1317 return NULL;
1318 }
1319
1320 if(ch == 0x0000 && src->current+1 == src->end) {
1321 break;
1322 }
1323
1324 if (inChars) {
1325 if(src->parsedToken.charsLen == 0) {
1326 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1327 }
1328 src->parsedToken.charsLen++;
1329 } else {
1330 if(newExtensionLen == 0) {
1331 extensionOffset = (uint32_t)(src->current - src->source);
1332 }
1333 newExtensionLen++;
1334 }
1335
1336 break;
1337 }
1338 }
1339 }
1340
1341 if(wasInQuote) {
1342 if(ch != 0x27) {
1343 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1344 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1345 }
1346 }
1347 }
1348
1349 src->current++;
1350 }
1351
1352 EndOfLoop:
1353 wasInQuote = FALSE;
1354 if (newStrength == UCOL_TOK_UNSET) {
1355 return NULL;
1356 }
1357
1358 if (src->parsedToken.charsLen == 0 && top == FALSE) {
1359 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1360 *status = U_INVALID_FORMAT_ERROR;
1361 DBG_FORMAT_ERROR
1362 return NULL;
1363 }
1364
1365 src->parsedToken.strength = newStrength;
1366 src->parsedToken.extensionOffset = extensionOffset;
1367 src->parsedToken.extensionLen = newExtensionLen;
1368 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1369
1370 return src->current;
1371 }
1372
1373 /*
1374 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
1375 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
1376 *
1377 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
1378 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates
1379 * it to separate tokens and returns one by one. In order to do that, the necessary states are
1380 * cached as member variables of the token parser.
1381 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
1382 * starting character as a single list token (which is separated into individual characters here)
1383 * and as another list token starting with the last character in the range. Before expanding it
1384 * as a list of tokens, this function expands the range by filling the intermediate characters and
1385 * returns them one by one as separate tokens.
1386 * Necessary checks are done for invalid combinations.
1387 */
1388 U_CAPI const UChar* U_EXPORT2
1389 ucol_tok_parseNextToken(UColTokenParser *src,
1390 UBool startOfRules,
1391 UParseError *parseError,
1392 UErrorCode *status)
1393 {
1394 const UChar *nextToken;
1395
1396 if (src->inRange) {
1397 // We are not done processing a range. Continue it.
1398 return ucol_tok_processNextCodePointInRange(src, status);
1399 } else if (src->isStarred) {
1400 // We are not done processing a starred token. Continue it.
1401 return ucol_tok_processNextTokenInStarredList(src);
1402 }
1403
1404 // Get the next token.
1405 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
1406
1407 if (nextToken == NULL) {
1408 return NULL;
1409 }
1410
1411 if (src->inRange) {
1412 // A new range has started.
1413 // Check whether it is a chain of ranges with more than one hyphen.
1414 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
1415 *status = U_INVALID_FORMAT_ERROR;
1416 syntaxError(src->source,src->parsedToken.charsOffset-1,
1417 src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
1418 DBG_FORMAT_ERROR
1419 return NULL;
1420 }
1421
1422 // The current token indicates the second code point of the range.
1423 // Process just that, and then proceed with the star.
1424 src->currentStarredCharIndex = src->parsedToken.charsOffset;
1425 U16_NEXT(src->source, src->currentStarredCharIndex,
1426 (uint32_t)(src->end - src->source), src->lastRangeCp);
1427 if (src->lastRangeCp <= src->previousCp) {
1428 *status = U_INVALID_FORMAT_ERROR;
1429 syntaxError(src->source,src->parsedToken.charsOffset-1,
1430 src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1431 DBG_FORMAT_ERROR
1432 return NULL;
1433 }
1434
1435 // Set current range code point to process the range loop
1436 src->currentRangeCp = src->previousCp + 1;
1437
1438 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1439
1440 return ucol_tok_processNextCodePointInRange(src, status);
1441 } else if (src->isStarred) {
1442 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
1443 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
1444 // separated into several tokens and returned.
1445 src->currentStarredCharIndex = src->parsedToken.charsOffset;
1446 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1447
1448 return ucol_tok_processNextTokenInStarredList(src);
1449 } else {
1450 // Set previous codepoint
1451 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
1452 }
1453 return nextToken;
1454 }
1455
1456
1457 /*
1458 Processing Description
1459 1 Build a ListList. Each list has a header, which contains two lists (positive
1460 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1461 reset may be null.
1462 2 As you process, you keep a LAST pointer that points to the last token you
1463 handled.
1464
1465 */
1466
1467 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
1468 UParseError *parseError, UErrorCode *status)
1469 {
1470 if(src->resultLen == src->listCapacity) {
1471 // Unfortunately, this won't work, as we store addresses of lhs in token
1472 src->listCapacity *= 2;
1473 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1474 if(src->lh == NULL) {
1475 *status = U_MEMORY_ALLOCATION_ERROR;
1476 return NULL;
1477 }
1478 }
1479 /* do the reset thing */
1480 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1481 /* test for NULL */
1482 if (sourceToken == NULL) {
1483 *status = U_MEMORY_ALLOCATION_ERROR;
1484 return NULL;
1485 }
1486 sourceToken->rulesToParseHdl = &(src->source);
1487 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1488 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1489
1490 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1491 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1492
1493 // keep the flags around so that we know about before
1494 sourceToken->flags = src->parsedToken.flags;
1495
1496 if(src->parsedToken.prefixOffset != 0) {
1497 // this is a syntax error
1498 *status = U_INVALID_FORMAT_ERROR;
1499 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1500 DBG_FORMAT_ERROR
1501 uprv_free(sourceToken);
1502 return 0;
1503 } else {
1504 sourceToken->prefix = 0;
1505 }
1506
1507 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1508 sourceToken->strength = UCOL_TOK_RESET;
1509 sourceToken->next = NULL;
1510 sourceToken->previous = NULL;
1511 sourceToken->noOfCEs = 0;
1512 sourceToken->noOfExpCEs = 0;
1513 sourceToken->listHeader = &src->lh[src->resultLen];
1514
1515 src->lh[src->resultLen].first = NULL;
1516 src->lh[src->resultLen].last = NULL;
1517 src->lh[src->resultLen].first = NULL;
1518 src->lh[src->resultLen].last = NULL;
1519
1520 src->lh[src->resultLen].reset = sourceToken;
1521
1522 /*
1523 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1524 First convert all expansions into normal form. Examples:
1525 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1526 d * ... into &x * c/y * d * ...
1527 Note: reset values can never have expansions, although they can cause the
1528 very next item to have one. They may be contractions, if they are found
1529 earlier in the list.
1530 */
1531 *expandNext = 0;
1532 if(expand != NULL) {
1533 /* check to see if there is an expansion */
1534 if(src->parsedToken.charsLen > 1) {
1535 uint32_t resetCharsOffset;
1536 resetCharsOffset = (uint32_t)(expand - src->source);
1537 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1538 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1539 }
1540 }
1541
1542 src->resultLen++;
1543
1544 uhash_put(src->tailored, sourceToken, sourceToken, status);
1545
1546 return sourceToken;
1547 }
1548
/*
 * Builds the reset token for a "before" reset whose anchor has to be taken from the UCA.
 *
 * The CEs of one UChar are fetched from src->UCA: the UChar at sourceToken's source offset
 * if sourceToken is not NULL, otherwise the UChar at src->parsedToken.charsOffset.
 * Two paths follow:
 *  - the lead byte of the first CE lies in the implicit range taken from UCAConstants:
 *    the preceding implicit value (raw-1) is computed, a two-UChar name (0xFFFE, code unit)
 *    is written to the extra buffer, and a new reset is created only if no token with
 *    that name is found in src->tailored;
 *  - otherwise the previous CE is looked up with ucol_inv_getPrevCE(), adjusted when the
 *    difference to the base CE is less than `strength` as reported by
 *    ucol_getCEStrengthDifference(), and a new reset is always created.
 * In both paths the base CEs are stored in src->lh[src->resultLen] before
 * ucol_tok_initAReset() is called.
 *
 * Returns the reset token, or NULL if *status already indicates a failure or
 * initializing the collation iterator fails; otherwise whatever uhash_get() /
 * ucol_tok_initAReset() returned.
 */
1549 static
1550 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1551 if(U_FAILURE(*status)) {
1552 return NULL;
1553 }
1554 /* this is a virgin before - we need to fish the anchor from the UCA */
1555 collIterate s;
1556 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1557 uint32_t CE, SecondCE;
// invPos is only read by the disabled (#if 0) code further down.
1558 uint32_t invPos;
// Iterate over a single UChar of the anchor (length argument is 1).
1559 if(sourceToken != NULL) {
1560 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
1561 } else {
1562 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
1563 }
1564 if(U_FAILURE(*status)) {
1565 return NULL;
1566 }
1567
// First CE with the bits outside mask 0xFFFFFF3F cleared; the CE that follows is kept in
// baseContCE, or 0 when the iterator has no more CEs.
1568 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1569 baseContCE = ucol_getNextCE(src->UCA, &s, status);
1570 if(baseContCE == UCOL_NO_MORE_CES) {
1571 baseContCE = 0;
1572 }
1573
1574
1575 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1576 uint32_t ch = 0;
1577 uint32_t expandNext = 0;
1578 UColToken key;
1579
1580 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
// Combine the primary bits of both CEs and step back by one raw implicit value.
1581 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
1582 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1583 ch = uprv_uca_getCodePointFromRaw(raw-1);
1584 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1585 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
1586 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
1587
// The constructed anchor is named by two UChars appended to the extra buffer.
// NOTE(review): the two stores below advance extraCurrent without a visible capacity
// check, and (UChar)ch keeps only the low 16 bits of ch - confirm both are safe here.
// NOTE(review): charsLen is incremented, not assigned, so the key length depends on the
// value charsLen had on entry - confirm.
1588 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1589 *src->extraCurrent++ = 0xFFFE;
1590 *src->extraCurrent++ = (UChar)ch;
1591 src->parsedToken.charsLen++;
1592
1593 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1594 key.rulesToParseHdl = &(src->source);
1595
1596 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1597 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1598
// Only create a new reset when no token with this name exists yet.
1599 if(sourceToken == NULL) {
1600 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1601 if(isContinuation(SecondCE)) {
1602 src->lh[src->resultLen].baseContCE = SecondCE;
1603 } else {
1604 src->lh[src->resultLen].baseContCE = 0;
1605 }
1606 src->lh[src->resultLen].nextCE = 0;
1607 src->lh[src->resultLen].nextContCE = 0;
1608 src->lh[src->resultLen].previousCE = 0;
1609 src->lh[src->resultLen].previousContCE = 0;
1610
1611 src->lh[src->resultLen].indirect = FALSE;
1612
1613 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1614 }
1615
1616 } else {
// Non-implicit anchor: ask the inverse UCA for the CE preceding the base CE.
1617 invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1618
1619 // we got the previous CE. Now we need to see if the difference between
1620 // the two CEs is really of the requested strength.
1621 // if it's a bigger difference (we asked for secondary and got primary), we
1622 // need to modify the CE.
1623 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1624 // adjust the strength
1625 // now we are in the situation where our baseCE should actually be modified in
1626 // order to get the CE in the right position.
1627 if(strength == UCOL_SECONDARY) {
1628 CE = baseCE - 0x0200;
1629 } else { // strength == UCOL_TERTIARY
1630 CE = baseCE - 0x02;
1631 }
1632 if(baseContCE) {
1633 if(strength == UCOL_SECONDARY) {
1634 SecondCE = baseContCE - 0x0200;
1635 } else { // strength == UCOL_TERTIARY
1636 SecondCE = baseContCE - 0x02;
1637 }
1638 }
1639 }
1640
1641 #if 0
1642 // the code below relies on getting a code point from the inverse table, in order to be
1643 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1644 // 1. There are many code points that have the same CE
1645 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1646 // Also, in case when there is no equivalent strength before an element, we have to actually
1647 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1648 // before a is a primary difference.
1649
1650 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1651
1652
1653 ch = CETable[3*invPos+2];
1654
1655 if((ch & UCOL_INV_SIZEMASK) != 0) {
1656 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1657 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1658 ch = conts[offset];
1659 }
1660
1661 *src->extraCurrent++ = (UChar)ch;
1662 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1663 src->parsedToken.charsLen = 1;
1664
1665 // We got an UCA before. However, this might have been tailored.
1666 // example:
1667 // &\u30ca = \u306a
1668 // &[before 3]\u306a<<<\u306a|\u309d
1669
1670
1671 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1672 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1673 key.rulesToParseHdl = &(src->source);
1674
1675 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1676 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1677 #endif
1678
1679 // here is how it should be. The situation such as &[before 1]a < x, should be
1680 // resolved exactly as if we wrote &a > x.
1681 // therefore, I don't really care if the UCA value before a has been changed.
1682 // However, I do care if the strength between my element and the previous element
1683 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1684 // have to construct the base CE.
1685
1686
1687
1688 // if we found a tailored thing, we have to use the UCA value and construct
1689 // a new reset token with constructed name
1690 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1691 // character to which we want to anchor is already tailored.
1692 // We need to construct a new token which will be the anchor
1693 // point
1694 //*(src->extraCurrent-1) = 0xFFFE;
1695 //*src->extraCurrent++ = (UChar)ch;
1696 // grab before
// NOTE(review): the token name is moved back and lengthened by a fixed 10 UChars; why 10
// is enough (presumably to cover the preceding "[before N]" text) is not visible here - confirm.
1697 src->parsedToken.charsOffset -= 10;
1698 src->parsedToken.charsLen += 10;
1699 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1700 if(isContinuation(SecondCE)) {
1701 src->lh[src->resultLen].baseContCE = SecondCE;
1702 } else {
1703 src->lh[src->resultLen].baseContCE = 0;
1704 }
1705 src->lh[src->resultLen].nextCE = 0;
1706 src->lh[src->resultLen].nextContCE = 0;
1707 src->lh[src->resultLen].previousCE = 0;
1708 src->lh[src->resultLen].previousContCE = 0;
1709
1710 src->lh[src->resultLen].indirect = FALSE;
1711
1712 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1713 //}
1714 }
1715
1716 return sourceToken;
1717
1718 }
1719
1720 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1721 UColToken *lastToken = NULL;
1722 const UChar *parseEnd = NULL;
1723 uint32_t expandNext = 0;
1724 UBool variableTop = FALSE;
1725 UBool top = FALSE;
1726 uint16_t specs = 0;
1727 UColTokListHeader *ListList = NULL;
1728
1729 src->parsedToken.strength = UCOL_TOK_UNSET;
1730
1731 ListList = src->lh;
1732
1733 if(U_FAILURE(*status)) {
1734 return 0;
1735 }
1736 #ifdef DEBUG_FOR_CODE_POINTS
1737 char filename[35];
1738 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
1739 dfcp_fp = fopen(filename, "a");
1740 fprintf(stdout, "Output is in the file %s.\n", filename);
1741 #endif
1742
1743 #ifdef DEBUG_FOR_COLL_RULES
1744 std::string s3;
1745 UnicodeString(src->source).toUTF8String(s3);
1746 std::cout << "src->source = " << s3 << std::endl;
1747 #endif
1748
1749 while(src->current < src->end || src->isStarred) {
1750 src->parsedToken.prefixOffset = 0;
1751
1752 parseEnd = ucol_tok_parseNextToken(src,
1753 (UBool)(lastToken == NULL),
1754 parseError,
1755 status);
1756
1757 specs = src->parsedToken.flags;
1758
1759
1760 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1761 top = ((specs & UCOL_TOK_TOP) != 0);
1762
1763 if(U_SUCCESS(*status) && parseEnd != NULL) {
1764 UColToken *sourceToken = NULL;
1765 //uint32_t key = 0;
1766 uint32_t lastStrength = UCOL_TOK_UNSET;
1767
1768 if(lastToken != NULL ) {
1769 lastStrength = lastToken->strength;
1770 }
1771
1772 #ifdef DEBUG_FOR_CODE_POINTS
1773 UChar32 cp;
1774 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
1775 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
1776 #endif
1777 //key = newCharsLen << 24 | charsOffset;
1778 UColToken key;
1779 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1780 key.rulesToParseHdl = &(src->source);
1781
1782 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
1783 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1784
1785 if(src->parsedToken.strength != UCOL_TOK_RESET) {
1786 if(lastToken == NULL) { /* this means that rules haven't started properly */
1787 *status = U_INVALID_FORMAT_ERROR;
1788 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1789 DBG_FORMAT_ERROR
1790 return 0;
1791 }
1792 /* 6 Otherwise (when relation != reset) */
1793 if(sourceToken == NULL) {
1794 /* If sourceToken is null, create new one, */
1795 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1796 /* test for NULL */
1797 if (sourceToken == NULL) {
1798 *status = U_MEMORY_ALLOCATION_ERROR;
1799 return 0;
1800 }
1801 sourceToken->rulesToParseHdl = &(src->source);
1802 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1803
1804 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1805
1806 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1807 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1808
1809 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1810 sourceToken->next = NULL;
1811 sourceToken->previous = NULL;
1812 sourceToken->noOfCEs = 0;
1813 sourceToken->noOfExpCEs = 0;
1814 // keep the flags around so that we know about before
1815 sourceToken->flags = src->parsedToken.flags;
1816 uhash_put(src->tailored, sourceToken, sourceToken, status);
1817 if(U_FAILURE(*status)) {
1818 return 0;
1819 }
1820 } else {
1821 /* we could have fished out a reset here */
1822 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1823 /* otherwise remove sourceToken from where it was. */
1824 if(sourceToken->next != NULL) {
1825 if(sourceToken->next->strength > sourceToken->strength) {
1826 sourceToken->next->strength = sourceToken->strength;
1827 }
1828 sourceToken->next->previous = sourceToken->previous;
1829 } else {
1830 sourceToken->listHeader->last = sourceToken->previous;
1831 }
1832
1833 if(sourceToken->previous != NULL) {
1834 sourceToken->previous->next = sourceToken->next;
1835 } else {
1836 sourceToken->listHeader->first = sourceToken->next;
1837 }
1838 sourceToken->next = NULL;
1839 sourceToken->previous = NULL;
1840 }
1841 }
1842
1843 sourceToken->strength = src->parsedToken.strength;
1844 sourceToken->listHeader = lastToken->listHeader;
1845
1846 /*
1847 1. Find the strongest strength in each list, and set strongestP and strongestN
1848 accordingly in the headers.
1849 */
1850 if(lastStrength == UCOL_TOK_RESET
1851 || sourceToken->listHeader->first == 0) {
1852 /* If LAST is a reset
1853 insert sourceToken in the list. */
1854 if(sourceToken->listHeader->first == 0) {
1855 sourceToken->listHeader->first = sourceToken;
1856 sourceToken->listHeader->last = sourceToken;
1857 } else { /* we need to find a place for us */
1858 /* and we'll get in front of the same strength */
1859 if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1860 sourceToken->next = sourceToken->listHeader->first;
1861 sourceToken->next->previous = sourceToken;
1862 sourceToken->listHeader->first = sourceToken;
1863 sourceToken->previous = NULL;
1864 } else {
1865 lastToken = sourceToken->listHeader->first;
1866 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1867 lastToken = lastToken->next;
1868 }
1869 if(lastToken->next != NULL) {
1870 lastToken->next->previous = sourceToken;
1871 } else {
1872 sourceToken->listHeader->last = sourceToken;
1873 }
1874 sourceToken->previous = lastToken;
1875 sourceToken->next = lastToken->next;
1876 lastToken->next = sourceToken;
1877 }
1878 }
1879 } else {
1880 /* Otherwise (when LAST is not a reset)
1881 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1882 otherwise insert before.
1883 when inserting after or before, search to the next position with the same
1884 strength in that direction. (This is called postpone insertion). */
1885 if(sourceToken != lastToken) {
1886 if(lastToken->polarity == sourceToken->polarity) {
1887 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1888 lastToken = lastToken->next;
1889 }
1890 sourceToken->previous = lastToken;
1891 if(lastToken->next != NULL) {
1892 lastToken->next->previous = sourceToken;
1893 } else {
1894 sourceToken->listHeader->last = sourceToken;
1895 }
1896
1897 sourceToken->next = lastToken->next;
1898 lastToken->next = sourceToken;
1899 } else {
1900 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1901 lastToken = lastToken->previous;
1902 }
1903 sourceToken->next = lastToken;
1904 if(lastToken->previous != NULL) {
1905 lastToken->previous->next = sourceToken;
1906 } else {
1907 sourceToken->listHeader->first = sourceToken;
1908 }
1909 sourceToken->previous = lastToken->previous;
1910 lastToken->previous = sourceToken;
1911 }
1912 } else { /* repeated one thing twice in rules, stay with the stronger strength */
1913 if(lastStrength < sourceToken->strength) {
1914 sourceToken->strength = lastStrength;
1915 }
1916 }
1917 }
1918
1919 /* if the token was a variable top, we're gonna put it in */
1920 if(variableTop == TRUE && src->varTop == NULL) {
1921 variableTop = FALSE;
1922 src->varTop = sourceToken;
1923 }
1924
1925 // Treat the expansions.
1926 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1927 // (&abc * d * e <=> &ab * d / c * e / c)
1928 // if both of them are in effect for a token, they are combined.
1929
1930 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1931
1932 if(expandNext != 0) {
1933 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1934 expandNext = 0;
1935 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1936 sourceToken->expansion = expandNext;
1937 } else { /* there is both explicit and implicit expansion. We need to make a combination */
1938 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1939 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1940 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1941 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1942 }
1943 }
1944
1945 // This is just for debugging purposes
1946 if(sourceToken->expansion != 0) {
1947 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1948 } else {
1949 sourceToken->debugExpansion = 0;
1950 }
1951 // if the previous token was a reset before, the strength of this
1952 // token must match the strength of before. Otherwise we have an
1953 // undefined situation.
1954 // In other words, we currently have a cludge which we use to
1955 // represent &a >> x. This is written as &[before 2]a << x.
1956 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1957 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1958 if(beforeStrength != sourceToken->strength) {
1959 *status = U_INVALID_FORMAT_ERROR;
1960 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1961 DBG_FORMAT_ERROR
1962 return 0;
1963 }
1964 }
1965 } else {
1966 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1967 /* if the previous token was also a reset, */
1968 /*this means that we have two consecutive resets */
1969 /* and we want to remove the previous one if empty*/
1970 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1971 src->resultLen--;
1972 }
1973 }
1974
1975 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1976 uint32_t searchCharsLen = src->parsedToken.charsLen;
1977 while(searchCharsLen > 1 && sourceToken == NULL) {
1978 searchCharsLen--;
1979 //key = searchCharsLen << 24 | charsOffset;
1980 UColToken key;
1981 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1982 key.rulesToParseHdl = &(src->source);
1983 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1984 }
1985 if(sourceToken != NULL) {
1986 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1987 }
1988 }
1989
1990 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1991 if(top == FALSE) { /* there is no indirection */
1992 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1993 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1994 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1995 while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1996 sourceToken = sourceToken->previous;
1997 }
1998 /* here, either we hit the strength or NULL */
1999 if(sourceToken->strength == strength) {
2000 if(sourceToken->previous != NULL) {
2001 sourceToken = sourceToken->previous;
2002 } else { /* start of list */
2003 sourceToken = sourceToken->listHeader->reset;
2004 }
2005 } else { /* we hit NULL */
2006 /* we should be doing the else part */
2007 sourceToken = sourceToken->listHeader->reset;
2008 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2009 }
2010 } else {
2011 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2012 }
2013 } else { /* this is both before and indirection */
2014 top = FALSE;
2015 ListList[src->resultLen].previousCE = 0;
2016 ListList[src->resultLen].previousContCE = 0;
2017 ListList[src->resultLen].indirect = TRUE;
2018 /* we need to do slightly more work. we need to get the baseCE using the */
2019 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
2020 /* in ucol_bld */
2021 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
2022 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2023 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
2024 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2025
2026 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2027 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
2028 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
2029 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
2030 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
2031 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
2032 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
2033 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
2034 } else {
2035 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
2036 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
2037 }
2038
2039 ListList[src->resultLen].baseCE = CE;
2040 ListList[src->resultLen].baseContCE = SecondCE;
2041 ListList[src->resultLen].nextCE = 0;
2042 ListList[src->resultLen].nextContCE = 0;
2043
2044 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2045 }
2046 }
2047
2048
2049 /* 5 If the relation is a reset:
2050 If sourceToken is null
2051 Create new list, create new sourceToken, make the baseCE from source, put
2052 the sourceToken in ListHeader of the new list */
2053 if(sourceToken == NULL) {
2054 /*
2055 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
2056 First convert all expansions into normal form. Examples:
2057 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
2058 d * ... into &x * c/y * d * ...
2059 Note: reset values can never have expansions, although they can cause the
2060 very next item to have one. They may be contractions, if they are found
2061 earlier in the list.
2062 */
2063 if(top == FALSE) {
2064 collIterate s;
2065 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2066
2067 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
2068
2069 CE = ucol_getNextCE(src->UCA, &s, status);
2070 const UChar *expand = s.pos;
2071 SecondCE = ucol_getNextCE(src->UCA, &s, status);
2072
2073 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
2074 if(isContinuation(SecondCE)) {
2075 ListList[src->resultLen].baseContCE = SecondCE;
2076 } else {
2077 ListList[src->resultLen].baseContCE = 0;
2078 }
2079 ListList[src->resultLen].nextCE = 0;
2080 ListList[src->resultLen].nextContCE = 0;
2081 ListList[src->resultLen].previousCE = 0;
2082 ListList[src->resultLen].previousContCE = 0;
2083 ListList[src->resultLen].indirect = FALSE;
2084 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
2085 } else { /* top == TRUE */
2086 /* just use the supplied values */
2087 top = FALSE;
2088 ListList[src->resultLen].previousCE = 0;
2089 ListList[src->resultLen].previousContCE = 0;
2090 ListList[src->resultLen].indirect = TRUE;
2091 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2092 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
2093 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
2094 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
2095
2096 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2097
2098 }
2099 } else { /* reset to something already in rules */
2100 top = FALSE;
2101 }
2102 }
2103 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
2104 lastToken = sourceToken;
2105 } else {
2106 if(U_FAILURE(*status)) {
2107 return 0;
2108 }
2109 }
2110 }
2111 #ifdef DEBUG_FOR_CODE_POINTS
2112 fclose(dfcp_fp);
2113 #endif
2114
2115
2116 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
2117 src->resultLen--;
2118 }
2119 return src->resultLen;
2120 }
2121
2122 const UChar* ucol_tok_getRulesFromBundle(
2123 void* /*context*/,
2124 const char* locale,
2125 const char* type,
2126 int32_t* pLength,
2127 UErrorCode* status)
2128 {
2129 const UChar* rules = NULL;
2130 UResourceBundle* bundle;
2131 UResourceBundle* collations;
2132 UResourceBundle* collation;
2133
2134 *pLength = 0;
2135
2136 bundle = ures_open(U_ICUDATA_COLL, locale, status);
2137 if(U_SUCCESS(*status)){
2138 collations = ures_getByKey(bundle, "collations", NULL, status);
2139 if(U_SUCCESS(*status)){
2140 collation = ures_getByKey(collations, type, NULL, status);
2141 if(U_SUCCESS(*status)){
2142 rules = ures_getStringByKey(collation, "Sequence", pLength, status);
2143 if(U_FAILURE(*status)){
2144 *pLength = 0;
2145 rules = NULL;
2146 }
2147 ures_close(collation);
2148 }
2149 ures_close(collations);
2150 }
2151 }
2152
2153 ures_close(bundle);
2154
2155 return rules;
2156 }
2157
2158 void ucol_tok_initTokenList(
2159 UColTokenParser *src,
2160 const UChar *rules,
2161 uint32_t rulesLength,
2162 const UCollator *UCA,
2163 GetCollationRulesFunction importFunc,
2164 void* context,
2165 UErrorCode *status) {
2166 U_NAMESPACE_USE
2167
2168 uint32_t nSize = 0;
2169 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
2170
2171 bool needToDeallocRules = false;
2172
2173 if(U_FAILURE(*status)) {
2174 return;
2175 }
2176
2177 // set everything to zero, so that we can clean up gracefully
2178 uprv_memset(src, 0, sizeof(UColTokenParser));
2179
2180 // first we need to find options that don't like to be normalized,
2181 // like copy and remove...
2182 //const UChar *openBrace = rules;
2183 int32_t optionNumber = -1;
2184 const UChar *setStart = NULL;
2185 uint32_t i = 0;
2186 while(i < rulesLength) {
2187 if(rules[i] == 0x005B) { // '[': start of an option
2188 /* Gets the following:
2189 optionNumber: The index of the option.
2190 setStart: The pointer at which the option arguments start.
2191 */
2192 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
2193
2194 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
2195 // [optimize]
2196 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2197 if(U_SUCCESS(*status)) {
2198 if(src->copySet == NULL) {
2199 src->copySet = newSet;
2200 } else {
2201 uset_addAll(src->copySet, newSet);
2202 uset_close(newSet);
2203 }
2204 } else {
2205 return;
2206 }
2207 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
2208 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2209 if(U_SUCCESS(*status)) {
2210 if(src->removeSet == NULL) {
2211 src->removeSet = newSet;
2212 } else {
2213 uset_addAll(src->removeSet, newSet);
2214 uset_close(newSet);
2215 }
2216 } else {
2217 return;
2218 }
2219 } else if(optionNumber == OPTION_IMPORT){
2220 // [import <collation-name>]
2221
2222 // Find the address of the closing ].
2223 UChar* import_end = u_strchr(setStart, 0x005D);
2224 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
2225 // Ignore trailing whitespace.
2226 while(uprv_isRuleWhiteSpace(*(import_end-1))) {
2227 --import_end;
2228 }
2229
2230 int32_t optionLength = (int32_t)(import_end - setStart);
2231 char option[50];
2232 if(optionLength >= (int32_t)sizeof(option)) {
2233 *status = U_ILLEGAL_ARGUMENT_ERROR;
2234 return;
2235 }
2236 u_UCharsToChars(setStart, option, optionLength);
2237 option[optionLength] = 0;
2238
2239 *status = U_ZERO_ERROR;
2240 char locale[50];
2241 int32_t templ;
2242 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
2243 if(U_FAILURE(*status)) {
2244 *status = U_ILLEGAL_ARGUMENT_ERROR;
2245 return;
2246 }
2247
2248 char type[50];
2249 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
2250 U_FAILURE(*status)
2251 ) {
2252 *status = U_ZERO_ERROR;
2253 uprv_strcpy(type, "standard");
2254 }
2255
2256 // TODO: Use public functions when available, see ticket #8134.
2257 char *keywords = (char *)locale_getKeywordsStart(locale);
2258 if(keywords != NULL) {
2259 *keywords = 0;
2260 }
2261
2262 int32_t importRulesLength = 0;
2263 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
2264
2265 #ifdef DEBUG_FOR_COLL_RULES
2266 std::string s;
2267 UnicodeString(importRules).toUTF8String(s);
2268 std::cout << "Import rules = " << s << std::endl;
2269 #endif
2270
2271 // Add the length of the imported rules to length of the original rules,
2272 // and subtract the length of the import option.
2273 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
2274
2275 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
2276
2277 #ifdef DEBUG_FOR_COLL_RULES
2278 std::string s1;
2279 UnicodeString(rules).toUTF8String(s1);
2280 std::cout << "Original rules = " << s1 << std::endl;
2281 #endif
2282
2283
2284 // Copy the section of the original rules leading up to the import
2285 uprv_memcpy(newRules, rules, i*sizeof(UChar));
2286 // Copy the imported rules
2287 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
2288 // Copy the rest of the original rules (minus the import option itself)
2289 uprv_memcpy(newRules+i+importRulesLength,
2290 rules+optionEndOffset,
2291 (rulesLength-optionEndOffset)*sizeof(UChar));
2292
2293 #ifdef DEBUG_FOR_COLL_RULES
2294 std::string s2;
2295 UnicodeString(newRules).toUTF8String(s2);
2296 std::cout << "Resulting rules = " << s2 << std::endl;
2297 #endif
2298
2299 if(needToDeallocRules){
2300 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2301 uprv_free((void*)rules);
2302 }
2303 needToDeallocRules = true;
2304 rules = newRules;
2305 rulesLength = newRulesLength;
2306
2307 estimatedSize += importRulesLength*2;
2308
2309 // First character of the new rules needs to be processed
2310 i--;
2311 }
2312 }
2313 //openBrace++;
2314 i++;
2315 }
2316
2317 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
2318 /* test for NULL */
2319 if (src->source == NULL) {
2320 *status = U_MEMORY_ALLOCATION_ERROR;
2321 return;
2322 }
2323 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
2324 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
2325 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
2326 *status = U_ZERO_ERROR;
2327 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
2328 /* test for NULL */
2329 if (src->source == NULL) {
2330 *status = U_MEMORY_ALLOCATION_ERROR;
2331 return;
2332 }
2333 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
2334 }
2335 if(needToDeallocRules){
2336 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2337 uprv_free((void*)rules);
2338 }
2339
2340
2341 src->current = src->source;
2342 src->end = src->source+nSize;
2343 src->sourceCurrent = src->source;
2344 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
2345 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2346 src->varTop = NULL;
2347 src->UCA = UCA;
2348 src->invUCA = ucol_initInverseUCA(status);
2349 src->parsedToken.charsLen = 0;
2350 src->parsedToken.charsOffset = 0;
2351 src->parsedToken.extensionLen = 0;
2352 src->parsedToken.extensionOffset = 0;
2353 src->parsedToken.prefixLen = 0;
2354 src->parsedToken.prefixOffset = 0;
2355 src->parsedToken.flags = 0;
2356 src->parsedToken.strength = UCOL_TOK_UNSET;
2357 src->buildCCTabFlag = FALSE;
2358 src->isStarred = FALSE;
2359 src->inRange = FALSE;
2360 src->lastRangeCp = 0;
2361 src->previousCp = 0;
2362
2363 if(U_FAILURE(*status)) {
2364 return;
2365 }
2366 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
2367 if(U_FAILURE(*status)) {
2368 return;
2369 }
2370 uhash_setValueDeleter(src->tailored, uhash_freeBlock);
2371
2372 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
2373 /* test for NULL */
2374 if (src->opts == NULL) {
2375 *status = U_MEMORY_ALLOCATION_ERROR;
2376 return;
2377 }
2378
2379 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
2380
2381 src->lh = 0;
2382 src->listCapacity = 1024;
2383 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
2384 //Test for NULL
2385 if (src->lh == NULL) {
2386 *status = U_MEMORY_ALLOCATION_ERROR;
2387 return;
2388 }
2389 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
2390 src->resultLen = 0;
2391
2392 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2393
2394 // UCOL_RESET_TOP_VALUE
2395 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2396 // UCOL_FIRST_PRIMARY_IGNORABLE
2397 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
2398 // UCOL_LAST_PRIMARY_IGNORABLE
2399 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
2400 // UCOL_FIRST_SECONDARY_IGNORABLE
2401 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
2402 // UCOL_LAST_SECONDARY_IGNORABLE
2403 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
2404 // UCOL_FIRST_TERTIARY_IGNORABLE
2405 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
2406 // UCOL_LAST_TERTIARY_IGNORABLE
2407 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
2408 // UCOL_FIRST_VARIABLE
2409 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
2410 // UCOL_LAST_VARIABLE
2411 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
2412 // UCOL_FIRST_NON_VARIABLE
2413 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
2414 // UCOL_LAST_NON_VARIABLE
2415 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2416 // UCOL_FIRST_IMPLICIT
2417 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
2418 // UCOL_LAST_IMPLICIT
2419 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
2420 // UCOL_FIRST_TRAILING
2421 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
2422 // UCOL_LAST_TRAILING
2423 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
2424 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
2425 }
2426
2427
2428 void ucol_tok_closeTokenList(UColTokenParser *src) {
2429 if(src->copySet != NULL) {
2430 uset_close(src->copySet);
2431 }
2432 if(src->removeSet != NULL) {
2433 uset_close(src->removeSet);
2434 }
2435 if(src->tailored != NULL) {
2436 uhash_close(src->tailored);
2437 }
2438 if(src->lh != NULL) {
2439 uprv_free(src->lh);
2440 }
2441 if(src->source != NULL) {
2442 uprv_free(src->source);
2443 }
2444 if(src->opts != NULL) {
2445 uprv_free(src->opts);
2446 }
2447 if (src->reorderCodes != NULL) {
2448 uprv_free(src->reorderCodes);
2449 }
2450 }
2451
2452 #endif /* #if !UCONFIG_NO_COLLATION */