]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_tok.cpp
ICU-6.2.14.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_tok.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
374ca955 4* Copyright (C) 2001-2004, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_tok.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created 02/22/2001
14* created by: Vladimir Weinstein
15*
374ca955 16* This module reads a tailoring rule string and produces a list of
b75a7d8f 17* tokens that will be turned into collation elements
374ca955 18*
b75a7d8f
A
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "unicode/ustring.h"
26#include "unicode/uchar.h"
27#include "unicode/uniset.h"
374ca955 28
b75a7d8f
A
29#include "ucol_tok.h"
30#include "cmemory.h"
374ca955 31#include "util.h"
b75a7d8f
A
32
33U_CDECL_BEGIN
34static int32_t U_EXPORT2 U_CALLCONV
35uhash_hashTokens(const UHashTok k)
36{
37 int32_t hash = 0;
38 //uint32_t key = (uint32_t)k.integer;
39 UColToken *key = (UColToken *)k.pointer;
40 if (key != 0) {
41 //int32_t len = (key & 0xFF000000)>>24;
42 int32_t len = (key->source & 0xFF000000)>>24;
43 int32_t inc = ((len - 32) / 32) + 1;
374ca955 44
b75a7d8f
A
45 //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
46 const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
374ca955 47 const UChar *limit = p + len;
b75a7d8f
A
48
49 while (p<limit) {
50 hash = (hash * 37) + *p;
51 p += inc;
52 }
53 }
54 return hash;
55}
56
57static UBool U_EXPORT2 U_CALLCONV
58uhash_compareTokens(const UHashTok key1, const UHashTok key2)
59{
60 //uint32_t p1 = (uint32_t) key1.integer;
61 //uint32_t p2 = (uint32_t) key2.integer;
62 UColToken *p1 = (UColToken *)key1.pointer;
63 UColToken *p2 = (UColToken *)key2.pointer;
64 const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
65 const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
66 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
67 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
68 const UChar *end = s1+s1L-1;
69
70 if (p1 == p2) {
71 return TRUE;
72 }
73 if (p1->source == 0 || p2->source == 0) {
74 return FALSE;
75 }
76 if(s1L != s2L) {
77 return FALSE;
78 }
79 if(p1->source == p2->source) {
80 return TRUE;
81 }
82 while((s1 < end) && *s1 == *s2) {
83 ++s1;
84 ++s2;
85 }
86 if(*s1 == *s2) {
87 return TRUE;
88 } else {
89 return FALSE;
90 }
91}
92U_CDECL_END
93
94static inline void U_CALLCONV
95uhash_freeBlockWrapper(void *obj) {
96 uhash_freeBlock(obj);
97}
98
99
/* One entry of the indirect positioning table: a start CE with its     */
/* continuation, and a limit CE with its continuation.                  */
typedef struct {
    uint32_t startCE;
    uint32_t startContCE;
    uint32_t limitCE;
    uint32_t limitContCE;
} indirectBoundaries;

/* These values are used for finding CE values for indirect positioning. */
/* Indirect positioning is a mechanism that allows resets on symbolic    */
/* values. It only works for resets; indirect names cannot be tailored.  */
/* An indirect name can define either an anchor point or a range. An     */
/* anchor point behaves exactly like a code point in a reset, except     */
/* that it cannot be tailored. A range (currently only the [top] range   */
/* is known) explicitly sets the upper bound for generated CEs, giving   */
/* better control over how many CEs can be squeezed into the range       */
/* without a performance penalty.                                        */
/* In that respect, [top] is used for tailoring locales that use CJK     */
/* characters. The other indirect values are currently a pure            */
/* convenience: they make sure that CEs are always positioned in the     */
/* same place relative to a point with known properties (e.g. first      */
/* primary ignorable).                                                   */
static indirectBoundaries ucolIndirectBoundaries[15];
122/*
123static indirectBoundaries ucolIndirectBoundaries[11] = {
374ca955 124 { UCOL_RESET_TOP_VALUE, 0,
b75a7d8f 125 UCOL_NEXT_TOP_VALUE, 0 },
374ca955 126 { UCOL_FIRST_PRIMARY_IGNORABLE, 0,
b75a7d8f 127 0, 0 },
374ca955 128 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
b75a7d8f 129 0, 0 },
374ca955 130 { UCOL_FIRST_SECONDARY_IGNORABLE, 0,
b75a7d8f 131 0, 0 },
374ca955 132 { UCOL_LAST_SECONDARY_IGNORABLE, 0,
b75a7d8f 133 0, 0 },
374ca955 134 { UCOL_FIRST_TERTIARY_IGNORABLE, 0,
b75a7d8f 135 0, 0 },
374ca955 136 { UCOL_LAST_TERTIARY_IGNORABLE, 0,
b75a7d8f 137 0, 0 },
374ca955 138 { UCOL_FIRST_VARIABLE, 0,
b75a7d8f 139 0, 0 },
374ca955 140 { UCOL_LAST_VARIABLE, 0,
b75a7d8f 141 0, 0 },
374ca955 142 { UCOL_FIRST_NON_VARIABLE, 0,
b75a7d8f 143 0, 0 },
374ca955 144 { UCOL_LAST_NON_VARIABLE, 0,
b75a7d8f
A
145 0, 0 },
146};
147*/
148
374ca955
A
149static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
150
b75a7d8f
A
151 // Set values for the top - TODO: once we have values for all the indirects, we are going
152 // to initalize here.
153 ucolIndirectBoundaries[indexR].startCE = start[0];
154 ucolIndirectBoundaries[indexR].startContCE = start[1];
155 if(end) {
156 ucolIndirectBoundaries[indexR].limitCE = end[0];
157 ucolIndirectBoundaries[indexR].limitContCE = end[1];
158 } else {
159 ucolIndirectBoundaries[indexR].limitCE = 0;
160 ucolIndirectBoundaries[indexR].limitContCE = 0;
161 }
162}
163
164
374ca955
A
165static inline
166void syntaxError(const UChar* rules,
b75a7d8f
A
167 int32_t pos,
168 int32_t rulesLen,
169 UParseError* parseError) {
170 parseError->offset = pos;
171 parseError->line = 0 ; /* we are not using line numbers */
374ca955 172
b75a7d8f
A
173 // for pre-context
174 int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
175 int32_t stop = pos;
374ca955 176
b75a7d8f
A
177 u_memcpy(parseError->preContext,rules+start,stop-start);
178 //null terminate the buffer
179 parseError->preContext[stop-start] = 0;
374ca955 180
b75a7d8f
A
181 //for post-context
182 start = pos+1;
374ca955
A
183 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
184 rulesLen;
b75a7d8f 185
374ca955
A
186 if(start < stop) {
187 u_memcpy(parseError->postContext,rules+start,stop-start);
188 //null terminate the buffer
189 parseError->postContext[stop-start]= 0;
190 } else {
191 parseError->postContext[0] = 0;
192 }
b75a7d8f
A
193}
194
195static
196void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
197 switch(attrib) {
198 case UCOL_HIRAGANA_QUATERNARY_MODE:
199 opts->hiraganaQ = value;
200 break;
201 case UCOL_FRENCH_COLLATION:
202 opts->frenchCollation = value;
203 break;
204 case UCOL_ALTERNATE_HANDLING:
205 opts->alternateHandling = value;
206 break;
207 case UCOL_CASE_FIRST:
208 opts->caseFirst = value;
209 break;
210 case UCOL_CASE_LEVEL:
211 opts->caseLevel = value;
212 break;
213 case UCOL_NORMALIZATION_MODE:
214 opts->normalizationMode = value;
215 break;
216 case UCOL_STRENGTH:
217 opts->strength = value;
218 break;
219 case UCOL_NUMERIC_COLLATION:
374ca955
A
220 opts->numericCollation = value;
221 break;
b75a7d8f
A
222 case UCOL_ATTRIBUTE_COUNT:
223 default:
224 break;
225 }
226}
227
228#define UTOK_OPTION_COUNT 20
229
230static UBool didInit = FALSE;
231/* we can be strict, or we can be lenient */
232/* I'd surely be lenient with the option arguments */
233/* maybe even with options */
234U_STRING_DECL(suboption_00, "non-ignorable", 13);
235U_STRING_DECL(suboption_01, "shifted", 7);
236
237U_STRING_DECL(suboption_02, "lower", 5);
238U_STRING_DECL(suboption_03, "upper", 5);
239U_STRING_DECL(suboption_04, "off", 3);
240U_STRING_DECL(suboption_05, "on", 2);
241U_STRING_DECL(suboption_06, "1", 1);
242U_STRING_DECL(suboption_07, "2", 1);
243U_STRING_DECL(suboption_08, "3", 1);
244U_STRING_DECL(suboption_09, "4", 1);
245U_STRING_DECL(suboption_10, "I", 1);
246
247U_STRING_DECL(suboption_11, "primary", 7);
248U_STRING_DECL(suboption_12, "secondary", 9);
249U_STRING_DECL(suboption_13, "tertiary", 8);
250U_STRING_DECL(suboption_14, "variable", 8);
251U_STRING_DECL(suboption_15, "regular", 7);
252U_STRING_DECL(suboption_16, "implicit", 8);
253U_STRING_DECL(suboption_17, "trailing", 8);
254
255
256U_STRING_DECL(option_00, "undefined", 9);
374ca955 257U_STRING_DECL(option_01, "rearrange", 9);
b75a7d8f 258U_STRING_DECL(option_02, "alternate", 9);
374ca955
A
259U_STRING_DECL(option_03, "backwards", 9);
260U_STRING_DECL(option_04, "variable top", 12);
261U_STRING_DECL(option_05, "top", 3);
262U_STRING_DECL(option_06, "normalization", 13);
263U_STRING_DECL(option_07, "caseLevel", 9);
264U_STRING_DECL(option_08, "caseFirst", 9);
265U_STRING_DECL(option_09, "scriptOrder", 11);
266U_STRING_DECL(option_10, "charsetname", 11);
267U_STRING_DECL(option_11, "charset", 7);
268U_STRING_DECL(option_12, "before", 6);
b75a7d8f
A
269U_STRING_DECL(option_13, "hiraganaQ", 9);
270U_STRING_DECL(option_14, "strength", 8);
271U_STRING_DECL(option_15, "first", 5);
272U_STRING_DECL(option_16, "last", 4);
273U_STRING_DECL(option_17, "optimize", 8);
274U_STRING_DECL(option_18, "suppressContractions", 20);
374ca955 275U_STRING_DECL(option_19, "numericOrdering", 15);
b75a7d8f
A
276
277
278/*
374ca955
A
279[last variable] last variable value
280[last primary ignorable] largest CE for primary ignorable
281[last secondary ignorable] largest CE for secondary ignorable
282[last tertiary ignorable] largest CE for tertiary ignorable
283[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
b75a7d8f
A
284*/
285
286
287static const ucolTokSuboption alternateSub[2] = {
288 {suboption_00, 13, UCOL_NON_IGNORABLE},
289 {suboption_01, 7, UCOL_SHIFTED}
290};
291
292static const ucolTokSuboption caseFirstSub[3] = {
293 {suboption_02, 5, UCOL_LOWER_FIRST},
294 {suboption_03, 5, UCOL_UPPER_FIRST},
295 {suboption_04, 3, UCOL_OFF},
296};
297
298static const ucolTokSuboption onOffSub[2] = {
299 {suboption_04, 3, UCOL_OFF},
300 {suboption_05, 2, UCOL_ON}
301};
302
303static const ucolTokSuboption frenchSub[1] = {
304 {suboption_07, 1, UCOL_ON}
305};
306
307static const ucolTokSuboption beforeSub[3] = {
308 {suboption_06, 1, UCOL_PRIMARY},
309 {suboption_07, 1, UCOL_SECONDARY},
310 {suboption_08, 1, UCOL_TERTIARY}
311};
312
313static const ucolTokSuboption strengthSub[5] = {
314 {suboption_06, 1, UCOL_PRIMARY},
315 {suboption_07, 1, UCOL_SECONDARY},
316 {suboption_08, 1, UCOL_TERTIARY},
317 {suboption_09, 1, UCOL_QUATERNARY},
318 {suboption_10, 1, UCOL_IDENTICAL},
319};
320
321static const ucolTokSuboption firstLastSub[7] = {
322 {suboption_11, 7, UCOL_PRIMARY},
323 {suboption_12, 9, UCOL_PRIMARY},
324 {suboption_13, 8, UCOL_PRIMARY},
325 {suboption_14, 8, UCOL_PRIMARY},
326 {suboption_15, 7, UCOL_PRIMARY},
327 {suboption_16, 8, UCOL_PRIMARY},
328 {suboption_17, 8, UCOL_PRIMARY},
329};
330
/* Indexes into rulesOptions; the numeric values must stay in step with */
/* the order of the entries in that table.                              */
enum OptionNumber {
    /* options that map directly onto a collator attribute */
    OPTION_ALTERNATE_HANDLING     = 0,
    OPTION_FRENCH_COLLATION       = 1,
    OPTION_CASE_LEVEL             = 2,
    OPTION_CASE_FIRST             = 3,
    OPTION_NORMALIZATION_MODE     = 4,
    OPTION_HIRAGANA_QUATERNARY    = 5,
    OPTION_STRENGTH               = 6,
    OPTION_NUMERIC_COLLATION      = 7,
    OPTION_NORMAL_OPTIONS_LIMIT   = OPTION_NUMERIC_COLLATION,
    /* the remaining options */
    OPTION_VARIABLE_TOP           = 8,
    OPTION_REARRANGE              = 9,
    OPTION_BEFORE                 = 10,
    OPTION_TOP                    = 11,
    OPTION_FIRST                  = 12,
    OPTION_LAST                   = 13,
    OPTION_OPTIMIZE               = 14,
    OPTION_SUPPRESS_CONTRACTIONS  = 15,
    OPTION_UNDEFINED              = 16,
    OPTION_SCRIPT_ORDER           = 17,
    OPTION_CHARSET_NAME           = 18,
    OPTION_CHARSET                = 19
};
354
355static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
356 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
357 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
358 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
359 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
360 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
361 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
362 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
374ca955 363 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
b75a7d8f
A
364 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
365 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
366 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
367 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
368 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
369 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
370 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
371 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
372 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
373 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
374 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
375 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */
376};
377
378static
374ca955
A
379int32_t u_strncmpNoCase(const UChar *s1,
380 const UChar *s2,
381 int32_t n)
b75a7d8f
A
382{
383 if(n > 0) {
384 int32_t rc;
385 for(;;) {
386 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
387 if(rc != 0 || *s1 == 0 || --n == 0) {
388 return rc;
389 }
390 ++s1;
391 ++s2;
392 }
393 }
394 return 0;
395}
396
397static
398void ucol_uprv_tok_initData() {
399 if(!didInit) {
400 U_STRING_INIT(suboption_00, "non-ignorable", 13);
401 U_STRING_INIT(suboption_01, "shifted", 7);
402
403 U_STRING_INIT(suboption_02, "lower", 5);
404 U_STRING_INIT(suboption_03, "upper", 5);
405 U_STRING_INIT(suboption_04, "off", 3);
406 U_STRING_INIT(suboption_05, "on", 2);
407
408 U_STRING_INIT(suboption_06, "1", 1);
409 U_STRING_INIT(suboption_07, "2", 1);
410 U_STRING_INIT(suboption_08, "3", 1);
411 U_STRING_INIT(suboption_09, "4", 1);
412 U_STRING_INIT(suboption_10, "I", 1);
413
414 U_STRING_INIT(suboption_11, "primary", 7);
415 U_STRING_INIT(suboption_12, "secondary", 9);
416 U_STRING_INIT(suboption_13, "tertiary", 8);
417 U_STRING_INIT(suboption_14, "variable", 8);
418 U_STRING_INIT(suboption_15, "regular", 7);
419 U_STRING_INIT(suboption_16, "implicit", 8);
420 U_STRING_INIT(suboption_17, "trailing", 8);
421
422
423 U_STRING_INIT(option_00, "undefined", 9);
374ca955 424 U_STRING_INIT(option_01, "rearrange", 9);
b75a7d8f 425 U_STRING_INIT(option_02, "alternate", 9);
374ca955
A
426 U_STRING_INIT(option_03, "backwards", 9);
427 U_STRING_INIT(option_04, "variable top", 12);
428 U_STRING_INIT(option_05, "top", 3);
429 U_STRING_INIT(option_06, "normalization", 13);
430 U_STRING_INIT(option_07, "caseLevel", 9);
431 U_STRING_INIT(option_08, "caseFirst", 9);
432 U_STRING_INIT(option_09, "scriptOrder", 11);
433 U_STRING_INIT(option_10, "charsetname", 11);
434 U_STRING_INIT(option_11, "charset", 7);
435 U_STRING_INIT(option_12, "before", 6);
b75a7d8f
A
436 U_STRING_INIT(option_13, "hiraganaQ", 9);
437 U_STRING_INIT(option_14, "strength", 8);
438 U_STRING_INIT(option_15, "first", 5);
439 U_STRING_INIT(option_16, "last", 4);
440 U_STRING_INIT(option_17, "optimize", 8);
441 U_STRING_INIT(option_18, "suppressContractions", 20);
374ca955 442 U_STRING_INIT(option_19, "numericOrdering", 15);
b75a7d8f
A
443 didInit = TRUE;
444 }
445}
446
447
448// This function reads basic options to set in the runtime collator
449// used by data driven tests. Should not support build time options
450U_CAPI const UChar * U_EXPORT2
374ca955
A
451ucol_tok_getNextArgument(const UChar *start, const UChar *end,
452 UColAttribute *attrib, UColAttributeValue *value,
b75a7d8f
A
453 UErrorCode *status) {
454 uint32_t i = 0;
455 int32_t j=0;
456 UBool foundOption = FALSE;
457 const UChar *optionArg = NULL;
458
459 ucol_uprv_tok_initData();
460
461 while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
462 start++;
463 }
464 if(start >= end) {
465 return NULL;
466 }
467 /* skip opening '[' */
468 if(*start == 0x005b) {
469 start++;
470 } else {
471 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
472 return NULL;
473 }
474
475 while(i < UTOK_OPTION_COUNT) {
476 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
477 foundOption = TRUE;
478 if(end - start > rulesOptions[i].optionLen) {
479 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
480 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
481 optionArg++;
482 }
374ca955 483 }
b75a7d8f
A
484 break;
485 }
486 i++;
487 }
488
489 if(!foundOption) {
490 *status = U_ILLEGAL_ARGUMENT_ERROR;
491 return NULL;
492 }
493
494 if(optionArg) {
495 for(j = 0; j<rulesOptions[i].subSize; j++) {
496 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
497 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
498 *attrib = rulesOptions[i].attr;
499 *value = rulesOptions[i].subopts[j].attrVal;
500 optionArg += rulesOptions[i].subopts[j].subLen;
501 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
502 optionArg++;
503 }
504 if(*optionArg == 0x005d) {
505 optionArg++;
506 return optionArg;
507 } else {
508 *status = U_ILLEGAL_ARGUMENT_ERROR;
509 return NULL;
510 }
511 }
512 }
513 }
514 *status = U_ILLEGAL_ARGUMENT_ERROR;
515 return NULL;
516}
517
374ca955 518static
b75a7d8f
A
519USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
520 while(*start != 0x005b) { /* advance while we find the first '[' */
521 start++;
522 }
374ca955 523 // now we need to get a balanced set of '[]'. The problem is that a set can have
b75a7d8f
A
524 // many, and *end point to the first closing '['
525 int32_t noOpenBraces = 1;
526 int32_t current = 1; // skip the opening brace
527 while(start+current < end && noOpenBraces != 0) {
528 if(start[current] == 0x005b) {
529 noOpenBraces++;
530 } else if(start[current] == 0x005D) { // closing brace
531 noOpenBraces--;
532 }
533 current++;
534 }
535 UChar *nextBrace = NULL;
536
537 if(noOpenBraces != 0 || (nextBrace = u_strchr(start+current, 0x005d /*']'*/)) == NULL) {
538 *status = U_ILLEGAL_ARGUMENT_ERROR;
539 return NULL;
540 }
541 return uset_openPattern(start, current, status);
542}
543
544static
545int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
546 int32_t i = 0;
374ca955
A
547 ucol_uprv_tok_initData();
548
b75a7d8f
A
549 while(u_isWhitespace(*start)) { /* eat whitespace */
550 start++;
551 }
552 while(i < UTOK_OPTION_COUNT) {
553 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
554 if(end - start > rulesOptions[i].optionLen) {
555 *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
556 while(u_isWhitespace(**optionArg)) { /* eat whitespace */
557 (*optionArg)++;
558 }
374ca955 559 }
b75a7d8f
A
560 break;
561 }
562 i++;
563 }
564 if(i == UTOK_OPTION_COUNT) {
565 i = -1; // didn't find an option
374ca955 566 }
b75a7d8f
A
567 return i;
568}
569
570
571// reads and conforms to various options in rules
572// end is the position of the first closing ']'
573// However, some of the options take an UnicodeSet definition
574// which needs to duplicate the closing ']'
575// for example: '[copy [\uAC00-\uD7FF]]'
374ca955 576// These options will move end to the second ']' and the
b75a7d8f
A
577// caller will set the current to it.
578static
579uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
580 const UChar* start = src->current;
581 int32_t i = 0;
582 int32_t j=0;
583 const UChar *optionArg = NULL;
584
585 uint8_t result = 0;
586
b75a7d8f
A
587 start++; /*skip opening '['*/
588 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
589 if(optionArg) {
590 src->current = optionArg;
591 }
592
593 if(i < 0) {
594 *status = U_ILLEGAL_ARGUMENT_ERROR;
595 } else {
596 int32_t noOpenBraces = 1;
597 switch(i) {
598 case OPTION_ALTERNATE_HANDLING:
599 case OPTION_FRENCH_COLLATION:
600 case OPTION_CASE_LEVEL:
601 case OPTION_CASE_FIRST:
602 case OPTION_NORMALIZATION_MODE:
603 case OPTION_HIRAGANA_QUATERNARY:
604 case OPTION_STRENGTH:
605 case OPTION_NUMERIC_COLLATION:
606 if(optionArg) {
607 for(j = 0; j<rulesOptions[i].subSize; j++) {
608 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
609 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
610 result = UCOL_TOK_SUCCESS;
611 }
612 }
374ca955 613 }
b75a7d8f
A
614 if(result == 0) {
615 *status = U_ILLEGAL_ARGUMENT_ERROR;
616 }
617 break;
618 case OPTION_VARIABLE_TOP:
619 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
620 break;
621 case OPTION_REARRANGE:
622 result = UCOL_TOK_SUCCESS;
623 break;
624 case OPTION_BEFORE:
625 if(optionArg) {
626 for(j = 0; j<rulesOptions[i].subSize; j++) {
627 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
628 result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
629 }
630 }
631 }
632 if(result == 0) {
633 *status = U_ILLEGAL_ARGUMENT_ERROR;
634 }
635 break;
636 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
637 /* index to this array will be src->parsedToken.indirectIndex*/
638 src->parsedToken.indirectIndex = 0;
639 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
640 break;
641 case OPTION_FIRST:
642 case OPTION_LAST: /* first, last */
643 for(j = 0; j<rulesOptions[i].subSize; j++) {
644 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
645 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
646 // element of indirect boundaries is reserved for top.
647 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
648 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
649 }
650 }
651 if(result == 0) {
652 *status = U_ILLEGAL_ARGUMENT_ERROR;
653 }
654 break;
655 case OPTION_OPTIMIZE:
656 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
657 // we need to move end here
658 src->current++; // skip opening brace
659 while(src->current < src->end && noOpenBraces != 0) {
660 if(*src->current == 0x005b) {
661 noOpenBraces++;
662 } else if(*src->current == 0x005D) { // closing brace
663 noOpenBraces--;
664 }
665 src->current++;
666 }
667 result = UCOL_TOK_SUCCESS;
668 break;
669 default:
670 *status = U_UNSUPPORTED_ERROR;
671 break;
672 }
673 }
674 src->current = u_memchr(src->current, 0x005d, src->end-src->current);
675 return result;
676}
677
374ca955
A
678
679inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
680 if(src->extraCurrent+len >= src->extraEnd) {
681 /* reallocate */
682 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
683 if(newSrc != NULL) {
684 src->current = newSrc + (src->current - src->source);
685 src->extraCurrent = newSrc + (src->extraCurrent - src->source);
686 src->end = newSrc + (src->end - src->source);
687 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
688 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
689 src->source = newSrc;
690 } else {
691 *status = U_MEMORY_ALLOCATION_ERROR;
692 }
693 }
694 if(len == 1) {
695 *src->extraCurrent++ = *stuff;
696 } else {
697 uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
698 src->extraCurrent += len;
699 }
700
701
702}
703
704inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
b75a7d8f
A
705 /*
706 top = TRUE;
707 */
374ca955 708 UChar buff[5];
b75a7d8f 709 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
374ca955
A
710 buff[0] = 0xFFFE;
711 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
712 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
b75a7d8f
A
713 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
714 src->parsedToken.charsLen = 3;
374ca955 715 ucol_tok_addToExtraCurrent(src, buff, 3, status);
b75a7d8f 716 } else {
374ca955
A
717 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
718 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
b75a7d8f 719 src->parsedToken.charsLen = 5;
374ca955
A
720 ucol_tok_addToExtraCurrent(src, buff, 5, status);
721 }
b75a7d8f
A
722 return TRUE;
723}
724
374ca955
A
725static UBool isCharNewLine(UChar c){
726 switch(c){
727 case 0x000A: /* LF */
728 case 0x000D: /* CR */
729 case 0x000C: /* FF */
730 case 0x0085: /* NEL */
731 case 0x2028: /* LS */
732 case 0x2029: /* PS */
733 return TRUE;
734 default:
735 return FALSE;
736 }
737}
738
b75a7d8f 739U_CAPI const UChar* U_EXPORT2
374ca955 740ucol_tok_parseNextToken(UColTokenParser *src,
b75a7d8f
A
741 UBool startOfRules,
742 UParseError *parseError,
374ca955 743 UErrorCode *status) {
b75a7d8f
A
744/* parsing part */
745 UBool variableTop = FALSE;
746 UBool top = FALSE;
747 UBool inChars = TRUE;
748 UBool inQuote = FALSE;
749 UBool wasInQuote = FALSE;
750 UChar *optionEnd = NULL;
751 uint8_t before = 0;
752 UBool isEscaped = FALSE;
753 // TODO: replace these variables with src->parsedToken counterparts
754 // no need to use them anymore since we have src->parsedToken.
755 // Ideally, token parser would be a nice class... Once, when I have
756 // more time (around 2020 probably).
757 uint32_t newExtensionLen = 0;
758 uint32_t extensionOffset = 0;
374ca955
A
759 uint32_t newStrength = UCOL_TOK_UNSET;
760 UChar buff[10];
b75a7d8f
A
761
762 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
763 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
764 src->parsedToken.indirectIndex = 0;
765
766 while (src->current < src->end) {
767 UChar ch = *(src->current);
768
769 if (inQuote) {
770 if (ch == 0x0027/*'\''*/) {
771 inQuote = FALSE;
772 } else {
773 if ((src->parsedToken.charsLen == 0) || inChars) {
774 if(src->parsedToken.charsLen == 0) {
775 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
776 }
777 src->parsedToken.charsLen++;
778 } else {
779 if(newExtensionLen == 0) {
780 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
781 }
782 newExtensionLen++;
783 }
784 }
785 }else if(isEscaped){
786 isEscaped =FALSE;
787 if (newStrength == UCOL_TOK_UNSET) {
374ca955 788 *status = U_INVALID_FORMAT_ERROR;
b75a7d8f
A
789 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
790 return NULL;
791 // enabling rules to start with non-tokens a < b
374ca955 792 // newStrength = UCOL_TOK_RESET;
b75a7d8f
A
793 }
794 if(ch != 0x0000 && src->current != src->end) {
795 if (inChars) {
796 if(src->parsedToken.charsLen == 0) {
797 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
798 }
799 src->parsedToken.charsLen++;
800 } else {
801 if(newExtensionLen == 0) {
802 extensionOffset = (uint32_t)(src->current - src->source);
803 }
804 newExtensionLen++;
805 }
806 }
807 }else {
808 if(!uprv_isRuleWhiteSpace(ch)) {
809 /* Sets the strength for this entry */
810 switch (ch) {
374ca955 811 case 0x003D/*'='*/ :
b75a7d8f
A
812 if (newStrength != UCOL_TOK_UNSET) {
813 goto EndOfLoop;
814 }
815
816 /* if we start with strength, we'll reset to top */
817 if(startOfRules == TRUE) {
818 src->parsedToken.indirectIndex = 5;
374ca955 819 top = ucol_tok_doSetTop(src, status);
b75a7d8f
A
820 newStrength = UCOL_TOK_RESET;
821 goto EndOfLoop;
822 }
823 newStrength = UCOL_IDENTICAL;
824 break;
825
374ca955 826 case 0x002C/*','*/:
b75a7d8f
A
827 if (newStrength != UCOL_TOK_UNSET) {
828 goto EndOfLoop;
829 }
830
831 /* if we start with strength, we'll reset to top */
832 if(startOfRules == TRUE) {
833 src->parsedToken.indirectIndex = 5;
374ca955 834 top = ucol_tok_doSetTop(src, status);
b75a7d8f
A
835 newStrength = UCOL_TOK_RESET;
836 goto EndOfLoop;
837 }
838 newStrength = UCOL_TERTIARY;
839 break;
840
841 case 0x003B/*';'*/:
842 if (newStrength != UCOL_TOK_UNSET) {
843 goto EndOfLoop;
844 }
845
846 /* if we start with strength, we'll reset to top */
847 if(startOfRules == TRUE) {
848 src->parsedToken.indirectIndex = 5;
374ca955 849 top = ucol_tok_doSetTop(src, status);
b75a7d8f
A
850 newStrength = UCOL_TOK_RESET;
851 goto EndOfLoop;
852 }
853 newStrength = UCOL_SECONDARY;
854 break;
855
374ca955 856 case 0x003C/*'<'*/:
b75a7d8f
A
857 if (newStrength != UCOL_TOK_UNSET) {
858 goto EndOfLoop;
859 }
860
861 /* if we start with strength, we'll reset to top */
862 if(startOfRules == TRUE) {
863 src->parsedToken.indirectIndex = 5;
374ca955 864 top = ucol_tok_doSetTop(src, status);
b75a7d8f
A
865 newStrength = UCOL_TOK_RESET;
866 goto EndOfLoop;
867 }
868 /* before this, do a scan to verify whether this is */
869 /* another strength */
870 if(*(src->current+1) == 0x003C) {
871 src->current++;
872 if(*(src->current+1) == 0x003C) {
873 src->current++; /* three in a row! */
874 newStrength = UCOL_TERTIARY;
875 } else { /* two in a row */
876 newStrength = UCOL_SECONDARY;
877 }
878 } else { /* just one */
879 newStrength = UCOL_PRIMARY;
880 }
881 break;
882
374ca955 883 case 0x0026/*'&'*/:
b75a7d8f
A
884 if (newStrength != UCOL_TOK_UNSET) {
885 /**/
886 goto EndOfLoop;
887 }
888
889 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
890 break;
891
892 case 0x005b/*'['*/:
893 /* options - read an option, analyze it */
894 if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
895 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
896 //src->current = optionEnd;
897 if(U_SUCCESS(*status)) {
898 if(result & UCOL_TOK_TOP) {
374ca955
A
899 if(newStrength == UCOL_TOK_RESET) {
900 top = ucol_tok_doSetTop(src, status);
b75a7d8f 901 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
b75a7d8f 902 src->parsedToken.charsLen+=2;
374ca955
A
903 buff[0] = 0x002d;
904 buff[1] = before;
905 ucol_tok_addToExtraCurrent(src, buff, 2, status);
b75a7d8f
A
906 }
907
908 src->current++;
909 goto EndOfLoop;
910 } else {
911 *status = U_INVALID_FORMAT_ERROR;
912 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
913 }
914 } else if(result & UCOL_TOK_VARIABLE_TOP) {
915 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
916 variableTop = TRUE;
917 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
918 src->parsedToken.charsLen = 1;
374ca955
A
919 buff[0] = 0xFFFF;
920 ucol_tok_addToExtraCurrent(src, buff, 1, status);
b75a7d8f
A
921 src->current++;
922 goto EndOfLoop;
923 } else {
924 *status = U_INVALID_FORMAT_ERROR;
925 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
926 }
927 } else if (result & UCOL_TOK_BEFORE){
928 if(newStrength == UCOL_TOK_RESET) {
929 before = result & UCOL_TOK_BEFORE;
930 } else {
931 *status = U_INVALID_FORMAT_ERROR;
932 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
933
934 }
374ca955 935 }
b75a7d8f
A
936 } else {
937 *status = U_INVALID_FORMAT_ERROR;
938 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
939 return NULL;
940 }
941 }
942 break;
374ca955
A
943 case 0x0021/*! skip java thai modifier reordering*/:
944 break;
b75a7d8f
A
945 case 0x002F/*'/'*/:
946 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
947 inChars = FALSE; /* we're now processing expansion */
948 break;
949 case 0x005C /* back slash for escaped chars */:
950 isEscaped = TRUE;
951 break;
952 /* found a quote, we're gonna start copying */
953 case 0x0027/*'\''*/:
954 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
374ca955 955 *status = U_INVALID_FORMAT_ERROR;
b75a7d8f
A
956 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
957 return NULL;
374ca955 958 // enabling rules to start with a non-token character a < b
b75a7d8f
A
959 // newStrength = UCOL_TOK_RESET;
960 }
961
962 inQuote = TRUE;
963
964 if(inChars) { /* we're doing characters */
965 if(wasInQuote == FALSE) {
966 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
967 }
968 if (src->parsedToken.charsLen != 0) {
374ca955 969 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
b75a7d8f
A
970 }
971 src->parsedToken.charsLen++;
972 } else { /* we're doing an expansion */
973 if(wasInQuote == FALSE) {
974 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
975 }
976 if (newExtensionLen != 0) {
374ca955 977 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
b75a7d8f
A
978 }
979 newExtensionLen++;
980 }
981
982 wasInQuote = TRUE;
983
374ca955 984 ch = *(++(src->current));
b75a7d8f 985 if(ch == 0x0027) { /* copy the double quote */
374ca955 986 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
b75a7d8f
A
987 inQuote = FALSE;
988 }
989 break;
990
991 /* '@' is french only if the strength is not currently set */
992 /* if it is, it's just a regular character in collation rules */
993 case 0x0040/*'@'*/:
994 if (newStrength == UCOL_TOK_UNSET) {
995 src->opts->frenchCollation = UCOL_ON;
996 break;
997 }
998
999 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1000 // we want to store read characters to the prefix part and continue reading
1001 // the characters (proper way would be to restart reading the chars, but in
374ca955 1002 // that case we would have to complicate the token hasher, which I do not
b75a7d8f
A
1003 // intend to play with. Instead, we will do prefixes when prefixes are due
1004 // (before adding the elements).
1005 src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1006 src->parsedToken.prefixLen = src->parsedToken.charsLen;
1007
1008 if(inChars) { /* we're doing characters */
1009 if(wasInQuote == FALSE) {
1010 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1011 }
1012 if (src->parsedToken.charsLen != 0) {
374ca955 1013 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
b75a7d8f
A
1014 }
1015 src->parsedToken.charsLen++;
1016 }
1017
1018 wasInQuote = TRUE;
1019
1020 do {
374ca955 1021 ch = *(++(src->current));
b75a7d8f
A
1022 // skip whitespace between '|' and the character
1023 } while (uprv_isRuleWhiteSpace(ch));
1024 break;
374ca955 1025
b75a7d8f
A
1026 //charsOffset = 0;
1027 //newCharsLen = 0;
1028 //break; // We want to store the whole prefix/character sequence. If we break
1029 // the '|' is going to get lost.
374ca955
A
1030 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1031 do {
1032 ch = *(++(src->current));
1033 } while (!isCharNewLine(ch));
1034
1035 break;
b75a7d8f
A
1036 default:
1037 if (newStrength == UCOL_TOK_UNSET) {
374ca955 1038 *status = U_INVALID_FORMAT_ERROR;
b75a7d8f
A
1039 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1040 return NULL;
1041 }
1042
1043 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1044 *status = U_INVALID_FORMAT_ERROR;
1045 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1046 return NULL;
1047 }
1048
1049 if(ch == 0x0000 && src->current+1 == src->end) {
1050 break;
1051 }
1052
1053 if (inChars) {
1054 if(src->parsedToken.charsLen == 0) {
1055 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1056 }
1057 src->parsedToken.charsLen++;
1058 } else {
1059 if(newExtensionLen == 0) {
1060 extensionOffset = (uint32_t)(src->current - src->source);
1061 }
1062 newExtensionLen++;
1063 }
1064
1065 break;
374ca955 1066 }
b75a7d8f
A
1067 }
1068 }
1069
1070 if(wasInQuote) {
1071 if(ch != 0x27) {
374ca955
A
1072 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1073 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1074 }
b75a7d8f
A
1075 }
1076 }
1077
1078 src->current++;
1079 }
1080
1081 EndOfLoop:
1082 wasInQuote = FALSE;
1083 if (newStrength == UCOL_TOK_UNSET) {
1084 return NULL;
1085 }
1086
1087 if (src->parsedToken.charsLen == 0 && top == FALSE) {
374ca955 1088 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
b75a7d8f
A
1089 *status = U_INVALID_FORMAT_ERROR;
1090 return NULL;
1091 }
1092
374ca955 1093 src->parsedToken.strength = newStrength;
b75a7d8f
A
1094 src->parsedToken.extensionOffset = extensionOffset;
1095 src->parsedToken.extensionLen = newExtensionLen;
1096 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1097
1098 return src->current;
1099}
1100
1101/*
1102Processing Description
374ca955
A
1103 1 Build a ListList. Each list has a header, which contains two lists (positive
1104 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1105 reset may be null.
1106 2 As you process, you keep a LAST pointer that points to the last token you
1107 handled.
b75a7d8f
A
1108*/
1109
1110static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
1111 UParseError *parseError, UErrorCode *status) {
1112 if(src->resultLen == src->listCapacity) {
1113 // Unfortunately, this won't work, as we store addresses of lhs in token
1114 src->listCapacity *= 2;
1115 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1116 if(src->lh == NULL) {
1117 *status = U_MEMORY_ALLOCATION_ERROR;
1118 return NULL;
1119 }
1120 }
1121 /* do the reset thing */
1122 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1123 /* test for NULL */
1124 if (sourceToken == NULL) {
1125 *status = U_MEMORY_ALLOCATION_ERROR;
1126 return NULL;
1127 }
1128 sourceToken->rulesToParse = src->source;
1129 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1130 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1131
1132 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1133 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1134
374ca955
A
1135 // keep the flags around so that we know about before
1136 sourceToken->flags = src->parsedToken.flags;
1137
b75a7d8f 1138 if(src->parsedToken.prefixOffset != 0) {
374ca955 1139 // this is a syntax error
b75a7d8f
A
1140 *status = U_INVALID_FORMAT_ERROR;
1141 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1142 return 0;
1143 } else {
1144 sourceToken->prefix = 0;
1145 }
1146
1147 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1148 sourceToken->strength = UCOL_TOK_RESET;
1149 sourceToken->next = NULL;
1150 sourceToken->previous = NULL;
1151 sourceToken->noOfCEs = 0;
1152 sourceToken->noOfExpCEs = 0;
1153 sourceToken->listHeader = &src->lh[src->resultLen];
1154
1155 src->lh[src->resultLen].first = NULL;
1156 src->lh[src->resultLen].last = NULL;
1157 src->lh[src->resultLen].first = NULL;
1158 src->lh[src->resultLen].last = NULL;
1159
1160 src->lh[src->resultLen].reset = sourceToken;
1161
1162 /*
374ca955
A
1163 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1164 First convert all expansions into normal form. Examples:
1165 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1166 d * ... into &x * c/y * d * ...
1167 Note: reset values can never have expansions, although they can cause the
1168 very next item to have one. They may be contractions, if they are found
1169 earlier in the list.
b75a7d8f
A
1170 */
1171 if(expand != NULL) {
1172 /* check to see if there is an expansion */
1173 if(src->parsedToken.charsLen > 1) {
1174 uint32_t resetCharsOffset;
1175 resetCharsOffset = (uint32_t)(expand - src->source);
1176 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1177 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1178 } else {
1179 *expandNext = 0;
1180 }
1181 }
1182
1183 src->resultLen++;
1184
1185 uhash_put(src->tailored, sourceToken, sourceToken, status);
1186
1187 return sourceToken;
1188}
1189
1190static
1191inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1192 if(U_FAILURE(*status)) {
1193 return NULL;
1194 }
1195 /* this is a virgin before - we need to fish the anchor from the UCA */
1196 collIterate s;
1197 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1198 uint32_t CE, SecondCE;
1199 uint32_t invPos;
1200 if(sourceToken != NULL) {
374ca955 1201 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
b75a7d8f 1202 } else {
374ca955 1203 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
b75a7d8f
A
1204 }
1205
1206 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1207 baseContCE = ucol_getNextCE(src->UCA, &s, status);
1208 if(baseContCE == UCOL_NO_MORE_CES) {
1209 baseContCE = 0;
1210 }
1211
374ca955
A
1212
1213 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1214 uint32_t ch = 0;
b75a7d8f 1215 uint32_t expandNext = 0;
374ca955
A
1216 UColToken key;
1217
1218 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1219 uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1220 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1221 ch = uprv_uca_getCodePointFromRaw(raw-1);
1222 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1223 CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1224 SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1225
1226 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1227 *src->extraCurrent++ = 0xFFFE;
1228 *src->extraCurrent++ = (UChar)ch;
1229 src->parsedToken.charsLen++;
1230
1231 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1232 key.rulesToParse = src->source;
1233
1234 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1235 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1236
1237 if(sourceToken == NULL) {
1238 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1239 if(isContinuation(SecondCE)) {
1240 src->lh[src->resultLen].baseContCE = SecondCE;
1241 } else {
1242 src->lh[src->resultLen].baseContCE = 0;
1243 }
1244 src->lh[src->resultLen].nextCE = 0;
1245 src->lh[src->resultLen].nextContCE = 0;
1246 src->lh[src->resultLen].previousCE = 0;
1247 src->lh[src->resultLen].previousContCE = 0;
1248
1249 src->lh[src->resultLen].indirect = FALSE;
1250
1251 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1252 }
1253
1254 } else {
1255 invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1256
1257 // we got the previous CE. Now we need to see if the difference between
1258 // the two CEs is really of the requested strength.
1259 // if it's a bigger difference (we asked for secondary and got primary), we
1260 // need to modify the CE.
1261 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1262 // adjust the strength
1263 // now we are in the situation where our baseCE should actually be modified in
1264 // order to get the CE in the right position.
1265 if(strength == UCOL_SECONDARY) {
1266 CE = baseCE - 0x0200;
1267 } else { // strength == UCOL_TERTIARY
1268 CE = baseCE - 0x02;
1269 }
1270 if(baseContCE) {
1271 if(strength == UCOL_SECONDARY) {
1272 SecondCE = baseContCE - 0x0200;
1273 } else { // strength == UCOL_TERTIARY
1274 SecondCE = baseContCE - 0x02;
1275 }
1276 }
1277 }
1278
1279#if 0
1280 // the code below relies on getting a code point from the inverse table, in order to be
1281 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1282 // 1. There are many code points that have the same CE
1283 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1284 // Also, in case when there is no equivalent strength before an element, we have to actually
1285 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1286 // before a is a primary difference.
1287
1288 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1289
1290
1291 ch = CETable[3*invPos+2];
1292
1293 if((ch & UCOL_INV_SIZEMASK) != 0) {
1294 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1295 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1296 ch = conts[offset];
1297 }
1298
1299 *src->extraCurrent++ = (UChar)ch;
1300 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1301 src->parsedToken.charsLen = 1;
1302
1303 // We got an UCA before. However, this might have been tailored.
1304 // example:
1305 // &\u30ca = \u306a
1306 // &[before 3]\u306a<<<\u306a|\u309d
1307
1308
1309 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1310 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1311 key.rulesToParse = src->source;
1312
1313 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1314 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1315#endif
1316
1317 // here is how it should be. The situation such as &[before 1]a < x, should be
1318 // resolved exactly as if we wrote &a > x.
1319 // therefore, I don't really care if the UCA value before a has been changed.
1320 // However, I do care if the strength between my element and the previous element
1321 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1322 // have to construct the base CE.
1323
1324
1325
1326 // if we found a tailored thing, we have to use the UCA value and construct
1327 // a new reset token with constructed name
1328 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1329 // character to which we want to anchor is already tailored.
1330 // We need to construct a new token which will be the anchor
1331 // point
1332 //*(src->extraCurrent-1) = 0xFFFE;
1333 //*src->extraCurrent++ = (UChar)ch;
1334 // grab before
1335 src->parsedToken.charsOffset -= 10;
1336 src->parsedToken.charsLen += 10;
1337 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1338 if(isContinuation(SecondCE)) {
1339 src->lh[src->resultLen].baseContCE = SecondCE;
1340 } else {
1341 src->lh[src->resultLen].baseContCE = 0;
1342 }
1343 src->lh[src->resultLen].nextCE = 0;
1344 src->lh[src->resultLen].nextContCE = 0;
1345 src->lh[src->resultLen].previousCE = 0;
1346 src->lh[src->resultLen].previousContCE = 0;
b75a7d8f 1347
374ca955 1348 src->lh[src->resultLen].indirect = FALSE;
b75a7d8f 1349
374ca955
A
1350 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1351 //}
b75a7d8f
A
1352 }
1353
1354 return sourceToken;
1355
1356}
1357
1358uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1359 UColToken *lastToken = NULL;
1360 const UChar *parseEnd = NULL;
1361 uint32_t expandNext = 0;
1362 UBool variableTop = FALSE;
1363 UBool top = FALSE;
1364 uint16_t specs = 0;
1365 UColTokListHeader *ListList = NULL;
1366
374ca955 1367 src->parsedToken.strength = UCOL_TOK_UNSET;
b75a7d8f
A
1368
1369 ListList = src->lh;
1370
1371 if(U_FAILURE(*status)) {
1372 return 0;
1373 }
1374
1375 while(src->current < src->end) {
1376 src->parsedToken.prefixOffset = 0;
374ca955
A
1377
1378 parseEnd = ucol_tok_parseNextToken(src,
b75a7d8f
A
1379 (UBool)(lastToken == NULL),
1380 parseError,
1381 status);
1382
1383 specs = src->parsedToken.flags;
1384
1385
1386 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1387 top = ((specs & UCOL_TOK_TOP) != 0);
1388
1389 if(U_SUCCESS(*status) && parseEnd != NULL) {
1390 UColToken *sourceToken = NULL;
1391 //uint32_t key = 0;
1392 uint32_t lastStrength = UCOL_TOK_UNSET;
374ca955 1393
b75a7d8f
A
1394 if(lastToken != NULL ) {
1395 lastStrength = lastToken->strength;
1396 }
1397
1398 //key = newCharsLen << 24 | charsOffset;
1399 UColToken key;
1400 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1401 key.rulesToParse = src->source;
1402
1403 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
1404 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1405
1406 if(src->parsedToken.strength != UCOL_TOK_RESET) {
1407 if(lastToken == NULL) { /* this means that rules haven't started properly */
1408 *status = U_INVALID_FORMAT_ERROR;
1409 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1410 return 0;
1411 }
1412 /* 6 Otherwise (when relation != reset) */
1413 if(sourceToken == NULL) {
1414 /* If sourceToken is null, create new one, */
1415 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1416 /* test for NULL */
1417 if (sourceToken == NULL) {
1418 *status = U_MEMORY_ALLOCATION_ERROR;
1419 return 0;
1420 }
1421 sourceToken->rulesToParse = src->source;
1422 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1423
1424 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1425
1426 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1427 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1428
1429 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1430 sourceToken->next = NULL;
1431 sourceToken->previous = NULL;
1432 sourceToken->noOfCEs = 0;
1433 sourceToken->noOfExpCEs = 0;
374ca955
A
1434 // keep the flags around so that we know about before
1435 sourceToken->flags = src->parsedToken.flags;
b75a7d8f
A
1436 uhash_put(src->tailored, sourceToken, sourceToken, status);
1437 } else {
1438 /* we could have fished out a reset here */
1439 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1440 /* otherwise remove sourceToken from where it was. */
1441 if(sourceToken->next != NULL) {
1442 if(sourceToken->next->strength > sourceToken->strength) {
1443 sourceToken->next->strength = sourceToken->strength;
1444 }
1445 sourceToken->next->previous = sourceToken->previous;
1446 } else {
1447 sourceToken->listHeader->last = sourceToken->previous;
1448 }
1449
1450 if(sourceToken->previous != NULL) {
1451 sourceToken->previous->next = sourceToken->next;
1452 } else {
1453 sourceToken->listHeader->first = sourceToken->next;
1454 }
1455 sourceToken->next = NULL;
1456 sourceToken->previous = NULL;
1457 }
1458 }
1459
1460 sourceToken->strength = src->parsedToken.strength;
1461 sourceToken->listHeader = lastToken->listHeader;
1462
1463 /*
374ca955
A
1464 1. Find the strongest strength in each list, and set strongestP and strongestN
1465 accordingly in the headers.
b75a7d8f 1466 */
374ca955 1467 if(lastStrength == UCOL_TOK_RESET
b75a7d8f 1468 || sourceToken->listHeader->first == 0) {
374ca955 1469 /* If LAST is a reset
b75a7d8f
A
1470 insert sourceToken in the list. */
1471 if(sourceToken->listHeader->first == 0) {
1472 sourceToken->listHeader->first = sourceToken;
1473 sourceToken->listHeader->last = sourceToken;
1474 } else { /* we need to find a place for us */
1475 /* and we'll get in front of the same strength */
1476 if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1477 sourceToken->next = sourceToken->listHeader->first;
1478 sourceToken->next->previous = sourceToken;
1479 sourceToken->listHeader->first = sourceToken;
1480 sourceToken->previous = NULL;
1481 } else {
1482 lastToken = sourceToken->listHeader->first;
1483 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1484 lastToken = lastToken->next;
1485 }
1486 if(lastToken->next != NULL) {
1487 lastToken->next->previous = sourceToken;
1488 } else {
1489 sourceToken->listHeader->last = sourceToken;
1490 }
1491 sourceToken->previous = lastToken;
1492 sourceToken->next = lastToken->next;
1493 lastToken->next = sourceToken;
1494 }
1495 }
1496 } else {
374ca955
A
1497 /* Otherwise (when LAST is not a reset)
1498 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1499 otherwise insert before.
1500 when inserting after or before, search to the next position with the same
b75a7d8f 1501 strength in that direction. (This is called postpone insertion). */
374ca955 1502 if(sourceToken != lastToken) {
b75a7d8f
A
1503 if(lastToken->polarity == sourceToken->polarity) {
1504 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1505 lastToken = lastToken->next;
1506 }
1507 sourceToken->previous = lastToken;
1508 if(lastToken->next != NULL) {
1509 lastToken->next->previous = sourceToken;
1510 } else {
1511 sourceToken->listHeader->last = sourceToken;
1512 }
1513
1514 sourceToken->next = lastToken->next;
1515 lastToken->next = sourceToken;
1516 } else {
1517 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1518 lastToken = lastToken->previous;
1519 }
1520 sourceToken->next = lastToken;
1521 if(lastToken->previous != NULL) {
1522 lastToken->previous->next = sourceToken;
1523 } else {
1524 sourceToken->listHeader->first = sourceToken;
1525 }
1526 sourceToken->previous = lastToken->previous;
1527 lastToken->previous = sourceToken;
1528 }
1529 } else { /* repeated one thing twice in rules, stay with the stronger strength */
1530 if(lastStrength < sourceToken->strength) {
1531 sourceToken->strength = lastStrength;
1532 }
1533 }
1534 }
1535
1536 /* if the token was a variable top, we're gonna put it in */
1537 if(variableTop == TRUE && src->varTop == NULL) {
1538 variableTop = FALSE;
1539 src->varTop = sourceToken;
1540 }
1541
1542 // Treat the expansions.
374ca955
A
1543 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1544 // (&abc * d * e <=> &ab * d / c * e / c)
b75a7d8f
A
1545 // if both of them are in effect for a token, they are combined.
1546
1547 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1548
1549 if(expandNext != 0) {
1550 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1551 expandNext = 0;
1552 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1553 sourceToken->expansion = expandNext;
1554 } else { /* there is both explicit and implicit expansion. We need to make a combination */
1555 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1556 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1557 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (src->extraCurrent - src->source));
1558 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1559 }
1560 }
1561
1562 // This is just for debugging purposes
1563 if(sourceToken->expansion != 0) {
1564 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1565 } else {
1566 sourceToken->debugExpansion = 0;
1567 }
374ca955
A
1568 // if the previous token was a reset before, the strength of this
1569 // token must match the strength of before. Otherwise we have an
1570 // undefined situation.
1571 // In other words, we currently have a cludge which we use to
1572 // represent &a >> x. This is written as &[before 2]a << x.
1573 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1574 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1575 if(beforeStrength != sourceToken->strength) {
1576 *status = U_INVALID_FORMAT_ERROR;
1577 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1578 return 0;
1579 }
1580 }
b75a7d8f
A
1581 } else {
1582 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1583 /* if the previous token was also a reset, */
1584 /*this means that we have two consecutive resets */
1585 /* and we want to remove the previous one if empty*/
1586 if(ListList[src->resultLen-1].first == NULL) {
1587 src->resultLen--;
1588 }
1589 }
1590
1591 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1592 uint32_t searchCharsLen = src->parsedToken.charsLen;
1593 while(searchCharsLen > 1 && sourceToken == NULL) {
1594 searchCharsLen--;
1595 //key = searchCharsLen << 24 | charsOffset;
1596 UColToken key;
1597 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1598 key.rulesToParse = src->source;
1599 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1600 }
1601 if(sourceToken != NULL) {
1602 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1603 }
1604 }
1605
1606 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1607 if(top == FALSE) { /* there is no indirection */
1608 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
374ca955 1609 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
b75a7d8f
A
1610 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1611 while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1612 sourceToken = sourceToken->previous;
1613 }
1614 /* here, either we hit the strength or NULL */
1615 if(sourceToken->strength == strength) {
1616 if(sourceToken->previous != NULL) {
1617 sourceToken = sourceToken->previous;
1618 } else { /* start of list */
1619 sourceToken = sourceToken->listHeader->reset;
374ca955 1620 }
b75a7d8f
A
1621 } else { /* we hit NULL */
1622 /* we should be doing the else part */
1623 sourceToken = sourceToken->listHeader->reset;
1624 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1625 }
1626 } else {
1627 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1628 }
1629 } else { /* this is both before and indirection */
1630 top = FALSE;
1631 ListList[src->resultLen].previousCE = 0;
1632 ListList[src->resultLen].previousContCE = 0;
1633 ListList[src->resultLen].indirect = TRUE;
1634 /* we need to do slightly more work. we need to get the baseCE using the */
1635 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
1636 /* in ucol_bld */
1637 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1638 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1639 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
1640 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1641
374ca955
A
1642 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1643 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1644 uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1645 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1646 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1647 CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1648 SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1649 } else {
1650 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
1651 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1652 }
b75a7d8f
A
1653
1654 ListList[src->resultLen].baseCE = CE;
1655 ListList[src->resultLen].baseContCE = SecondCE;
1656 ListList[src->resultLen].nextCE = 0;
1657 ListList[src->resultLen].nextContCE = 0;
1658
1659 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1660 }
1661 }
1662
1663
374ca955
A
1664 /* 5 If the relation is a reset:
1665 If sourceToken is null
1666 Create new list, create new sourceToken, make the baseCE from source, put
b75a7d8f
A
1667 the sourceToken in ListHeader of the new list */
1668 if(sourceToken == NULL) {
1669 /*
374ca955
A
1670 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1671 First convert all expansions into normal form. Examples:
1672 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1673 d * ... into &x * c/y * d * ...
1674 Note: reset values can never have expansions, although they can cause the
1675 very next item to have one. They may be contractions, if they are found
1676 earlier in the list.
b75a7d8f
A
1677 */
1678 if(top == FALSE) {
1679 collIterate s;
1680 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1681
1682 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
1683
1684 CE = ucol_getNextCE(src->UCA, &s, status);
1685 UChar *expand = s.pos;
1686 SecondCE = ucol_getNextCE(src->UCA, &s, status);
1687
1688 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1689 if(isContinuation(SecondCE)) {
1690 ListList[src->resultLen].baseContCE = SecondCE;
1691 } else {
1692 ListList[src->resultLen].baseContCE = 0;
1693 }
1694 ListList[src->resultLen].nextCE = 0;
1695 ListList[src->resultLen].nextContCE = 0;
1696 ListList[src->resultLen].previousCE = 0;
1697 ListList[src->resultLen].previousContCE = 0;
1698 ListList[src->resultLen].indirect = FALSE;
1699 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
1700 } else { /* top == TRUE */
1701 /* just use the supplied values */
1702 top = FALSE;
1703 ListList[src->resultLen].previousCE = 0;
1704 ListList[src->resultLen].previousContCE = 0;
1705 ListList[src->resultLen].indirect = TRUE;
1706 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1707 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
1708 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
1709 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
1710
1711 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1712
1713 }
1714 } else { /* reset to something already in rules */
1715 top = FALSE;
1716 }
1717 }
374ca955 1718 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
b75a7d8f
A
1719 lastToken = sourceToken;
1720 } else {
1721 if(U_FAILURE(*status)) {
1722 return 0;
1723 }
1724 }
1725 }
1726
1727 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1728 src->resultLen--;
1729 }
1730 return src->resultLen;
1731}
1732
374ca955 1733void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
b75a7d8f
A
1734 uint32_t nSize = 0;
1735 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
1736 if(U_FAILURE(*status)) {
1737 return;
1738 }
374ca955 1739
b75a7d8f
A
1740 // set everything to zero, so that we can clean up gracefully
1741 uprv_memset(src, 0, sizeof(UColTokenParser));
374ca955 1742
b75a7d8f
A
1743 // first we need to find options that don't like to be normalized,
1744 // like copy and remove...
1745 //const UChar *openBrace = rules;
1746 int32_t optionNumber = -1;
1747 const UChar *setStart;
1748 uint32_t i = 0;
1749 while(i < rulesLength) {
1750 if(rules[i] == 0x005B) {
1751 // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
1752 //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
1753 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
1754 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
1755 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1756 if(U_SUCCESS(*status)) {
1757 if(src->copySet == NULL) {
1758 src->copySet = newSet;
1759 } else {
1760 ((UnicodeSet *)src->copySet)->addAll(*((UnicodeSet *)newSet));
1761 uset_close(newSet);
1762 }
1763 } else {
1764 return;
1765 }
1766 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
1767 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1768 if(U_SUCCESS(*status)) {
1769 if(src->removeSet == NULL) {
1770 src->removeSet = newSet;
1771 } else {
1772 ((UnicodeSet *)src->removeSet)->addAll(*((UnicodeSet *)newSet));
1773 uset_close(newSet);
1774 }
1775 } else {
1776 return;
1777 }
1778 }
1779 }
1780 //openBrace++;
1781 i++;
1782 }
1783
1784 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
1785 /* test for NULL */
1786 if (src->source == NULL) {
1787 *status = U_MEMORY_ALLOCATION_ERROR;
1788 return;
1789 }
374ca955 1790 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
b75a7d8f
A
1791 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
1792 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
1793 *status = U_ZERO_ERROR;
1794 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
1795 /* test for NULL */
1796 if (src->source == NULL) {
1797 *status = U_MEMORY_ALLOCATION_ERROR;
1798 return;
1799 }
1800 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
1801 }
1802 src->current = src->source;
1803 src->end = src->source+nSize;
1804 src->sourceCurrent = src->source;
374ca955 1805 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
b75a7d8f
A
1806 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1807 src->varTop = NULL;
1808 src->UCA = UCA;
1809 src->invUCA = ucol_initInverseUCA(status);
1810 src->parsedToken.charsLen = 0;
1811 src->parsedToken.charsOffset = 0;
1812 src->parsedToken.extensionLen = 0;
1813 src->parsedToken.extensionOffset = 0;
1814 src->parsedToken.prefixLen = 0;
1815 src->parsedToken.prefixOffset = 0;
1816 src->parsedToken.flags = 0;
1817 src->parsedToken.strength = UCOL_TOK_UNSET;
1818
1819
1820 if(U_FAILURE(*status)) {
1821 return;
1822 }
1823 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, status);
1824 if(U_FAILURE(*status)) {
1825 return;
1826 }
1827 uhash_setValueDeleter(src->tailored, uhash_freeBlock);
1828
1829 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
1830 /* test for NULL */
1831 if (src->opts == NULL) {
1832 *status = U_MEMORY_ALLOCATION_ERROR;
1833 return;
1834 }
1835
1836 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
1837
1838 // rulesToParse = src->source;
1839 src->lh = 0;
1840 src->listCapacity = 1024;
1841 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
1842 //Test for NULL
1843 if (src->lh == NULL) {
1844 *status = U_MEMORY_ALLOCATION_ERROR;
1845 return;
1846 }
374ca955 1847 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
b75a7d8f
A
1848 src->resultLen = 0;
1849
1850 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1851
1852 // UCOL_RESET_TOP_VALUE
374ca955 1853 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
b75a7d8f
A
1854 // UCOL_FIRST_PRIMARY_IGNORABLE
1855 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
1856 // UCOL_LAST_PRIMARY_IGNORABLE
1857 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
1858 // UCOL_FIRST_SECONDARY_IGNORABLE
1859 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
1860 // UCOL_LAST_SECONDARY_IGNORABLE
1861 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
1862 // UCOL_FIRST_TERTIARY_IGNORABLE
1863 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
1864 // UCOL_LAST_TERTIARY_IGNORABLE
1865 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
1866 // UCOL_FIRST_VARIABLE
1867 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
1868 // UCOL_LAST_VARIABLE
1869 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
1870 // UCOL_FIRST_NON_VARIABLE
1871 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
1872 // UCOL_LAST_NON_VARIABLE
1873 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1874 // UCOL_FIRST_IMPLICIT
1875 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
1876 // UCOL_LAST_IMPLICIT
1877 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
1878 // UCOL_FIRST_TRAILING
1879 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
1880 // UCOL_LAST_TRAILING
1881 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
1882 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
1883}
1884
1885
1886void ucol_tok_closeTokenList(UColTokenParser *src) {
1887 if(src->copySet != NULL) {
1888 uset_close(src->copySet);
1889 }
1890 if(src->removeSet != NULL) {
1891 uset_close(src->removeSet);
1892 }
1893 if(src->tailored != NULL) {
1894 uhash_close(src->tailored);
1895 }
1896 if(src->lh != NULL) {
1897 uprv_free(src->lh);
1898 }
1899 if(src->source != NULL) {
1900 uprv_free(src->source);
1901 }
1902 if(src->opts != NULL) {
1903 uprv_free(src->opts);
1904 }
1905}
1906
1907#endif /* #if !UCONFIG_NO_COLLATION */
1908